Page 1 of 2 12 LastLast
Results 1 to 20 of 37

Thread: Lucene issue

  1. #1
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default Lucene issue

    Java Code:
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.queryParser.ParseException;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.search.Hits;
    import org.apache.lucene.index.*;
    import java.io.*;
    import java.util.ArrayList;
    import java.util.Iterator;
    import java.util.Scanner;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.Hit;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.store.RAMDirectory;
    
    /**
     * This terminal application creates an Apache Lucene index in a folder and adds files into this index
     * based on the input of the user.
     */
    public class TextFileIndexer {
    
        //declare all the fields
        
        public static final String FIELD_PATH = "path";
        public static final String FIELD_CONTENTS = "contents";
        public static final String FIELD_ID = "docno";
        public static final String FIELD_DATE = "date";
    
        private IndexWriter writer;
        private ArrayList<File> queue = new ArrayList<File>();
    
        @SuppressWarnings("static-access")
        public static void main(String[] args) throws IOException, ParseException {
    
            String s = null;
            Scanner sc = new Scanner(System.in);
            //Menu selections
            int choice;
            do {
                System.out.println("Welcome Search Engine. Please choose" + " your selections below\n" + "(1) Build index collection\n" + "(2) Search for the documents ids\n" + "(3) Exit\n");
    
                System.out.print("Enter your choice: ");
                choice = sc.nextInt();
    
                //switch statements
                switch (choice) {
    
                    case 1:
    
                        System.out.println("Enter the path where the index will be created: ");
    
                        BufferedReader br = new BufferedReader(
                                new InputStreamReader(System.in));
                        s = br.readLine();
    
                        TextFileIndexer indexer = null;
                        try {
                            indexer = new TextFileIndexer(s);
                        } catch (Exception ex) {
                            System.out.println("Cannot create index..." + ex.getMessage());
                            System.exit(-1);
                        }
                        try {
                            System.out.println("Enter the file or folder name to add into the index (q=quit):");
                            System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
                            s = br.readLine();
                            //try to add file into the index
                            indexer.indexFileOrDirectory(s);
                        } catch (Exception e) {
                            System.out.println("Error indexing " + s + " : " + e.getMessage());
                        }
                        indexer.closeIndex();
    
                        break;
    
                    case 2:
    
                        System.out.println("your query?");
                        br = new BufferedReader(
                                new InputStreamReader(System.in));
                        String a = br.readLine();
                        
                       searchIndex(a);
                        break;
                    case 3:
    
                        //exit the program
                        System.out.println("Program exiting..");
                        break;
    
                    default:
    
                        //display invalid selection
                        System.err.println("Err: Invalid selection");
                }
            } while (choice != 3);
    
        }
    
    
        public TextFileIndexer(String index) throws IOException, ParseException {
            // the boolean true parameter means to create a new index everytime,
            // potentially overwriting any existing files there.
            writer = new IndexWriter(index, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
        }
    
        public static void searchIndex(String words) throws IOException, ParseException{
    
            System.out.println("Searching for '" + words + "'"  );
            [B]Directory directory = FSDirectory.getDirectory(INDEX_COLLECTION);[/B]
            IndexReader indexReader = IndexReader.open(directory);
            IndexSearcher indexSearcher = new IndexSearcher(indexReader);
    
            Analyzer analyzer = new StandardAnalyzer();
            QueryParser queryParser = new QueryParser(FIELD_ID, analyzer);
            Query query = queryParser.parse(words);
            Hits hits = indexSearcher.search(query);
            System.out.println("Number of hits: " + hits.length());
    
            Iterator<Hit> it = hits.iterator();
            while (it.hasNext()) {
                Hit hit = it.next();
                Document document = hit.getDocument();
                String text = document.get(FIELD_ID);
                System.out.println("Hit: " + text);
            }
    
        }
    
        public void indexFileOrDirectory(String fileName) throws IOException {
    
            listFiles(new File(fileName));
    
            int originalNumDocs = writer.numDocs();
            for (File file : queue) {
                FileReader fr = null;
                try {
                    Document doc = new Document();
                    fr = new FileReader(file);
                    doc.add(new Field(FIELD_CONTENTS, fr));
    
                    String path = file.getCanonicalPath();
                    doc.add(new Field(FIELD_PATH, path, Field.Store.YES, Field.Index.UN_TOKENIZED));
    
                    String docno = file.getName();
                    doc.add(new Field(FIELD_ID, docno, Field.Store.YES, Field.Index.UN_TOKENIZED));
    
                    String date = file.getPath();
                    doc.add(new Field(FIELD_DATE, date, Field.Store.YES, Field.Index.UN_TOKENIZED));
    
                    writer.addDocument(doc);
                    System.out.println("Added: " + file);
    
    
                } catch (Exception e) {
                    System.out.println("Could not add: " + file);
                } finally {
                    fr.close();
                }
            }
    
            int newNumDocs = writer.numDocs();
            System.out.println("");
            System.out.println("************************");
            System.out.println((newNumDocs - originalNumDocs) + " documents added.");
            System.out.println("************************");
    
            queue.clear();
        }
    
        private void listFiles(File file) {
            if (!file.exists()) {
                System.out.println(file + " does not exist.");
            }
            if (file.isDirectory()) {
                for (File f : file.listFiles()) {
                    listFiles(f);
                }
            } else {
                String filename = file.getName().toLowerCase();
              
                if (filename.endsWith(".htm") || filename.endsWith(".html") ||
                        filename.endsWith(".xml") || filename.endsWith(".txt")) {
                    queue.add(file);
                } else {
                    System.out.println("Skipped " + filename);
                }
            }
        }
    
        public void closeIndex() throws IOException {
            writer.optimize();
            writer.close();
        }
    
         
    }
    hi guys,

    the problem is in bold. what i would like to have is to get the directory of my files.thanks. but i unable to do so.

  2. #2
    Petr's Avatar
    Petr is offline Senior Member
    Join Date
    Jan 2011
    Location
    Russia
    Posts
    620
    Rep Power
    4

    Default

    Sorry You are very difficult understand what you want.
    Do you want to index a special folder?
    Skype: petrarsentev
    http://TrackStudio.com

  3. #3
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    hi,

    yes.its a special folder which consist of "tdt3/19982001/(and the text files). how should i be able to get the directory of this ?thanks.i am able to build the index. but when i try to do a search index, i dont know how to get the directory. i try manually but its said "access is denied" thanks.

  4. #4
    Petr's Avatar
    Petr is offline Senior Member
    Join Date
    Jan 2011
    Location
    Russia
    Posts
    620
    Rep Power
    4

    Default

    see
    Java Code:
    package com.action;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.search.*;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    
    import java.io.File;
    import java.io.FileReader;
    import java.io.IOException;
    import java.io.Reader;
    
    public class LuceneHtml {
    
    	public static final String FILES_TO_INDEX_DIRECTORY = "filesToIndex";
    	public static final String INDEX_DIRECTORY = "indexDirectory";
    
    	public static final String FIELD_PATH = "path";
    	public static final String FIELD_CONTENTS = "contents";
    
    	public static void main(String[] args) throws Exception {
    		createIndex();
    		searchIndex("msg");
    	}
    
    	public static void createIndex() throws IOException {
    		Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    		File index = new File(INDEX_DIRECTORY);
    		IndexWriter indexWriter = new IndexWriter(FSDirectory.open(index), analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
    		File dir = new File(FILES_TO_INDEX_DIRECTORY);
    		File[] files = dir.listFiles();
    		for (File file : files) {
    			Document document = new Document();
    			String path = file.getCanonicalPath();
    			document.add(new Field(FIELD_PATH, path, Field.Store.YES, Field.Index.ANALYZED));
    			Reader reader = new FileReader(file);
    			document.add(new Field(FIELD_CONTENTS, reader));
    			indexWriter.addDocument(document);
    		}
    		indexWriter.optimize();
    		indexWriter.close();
    	}
    
    	public static void searchIndex(String searchString) throws Exception {
    		System.out.println("Searching for '" + searchString + "'");
            TopDocsCollector collector = TopFieldCollector.create(new Sort(new SortField(null, SortField.DOC, true)), 100, false, false, false, false);
    		Directory directory = FSDirectory.open(new File(INDEX_DIRECTORY));
    		IndexReader indexReader = IndexReader.open(directory);
    		IndexSearcher indexSearcher = new IndexSearcher(indexReader);
    
    		Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    		QueryParser queryParser = new QueryParser(Version.LUCENE_30, FIELD_CONTENTS, analyzer);
    		Query query = queryParser.parse(searchString);
            indexSearcher.search(query, collector);
    		ScoreDoc[] hits = collector.topDocs().scoreDocs;
    		System.out.println("Number of hits: " + hits.length);
    
    		for (ScoreDoc hit : hits) {
    			Document document = indexSearcher.doc(hit.doc);
    			String path = document.get(FIELD_PATH);
    			System.out.println("Hit: " + path);
    		}
    	}
    }
    Skype: petrarsentev
    http://TrackStudio.com

  5. #5
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    hi,

    may i know what lucene 3.0 is?thanks.

  6. #6
    Petr's Avatar
    Petr is offline Senior Member
    Join Date
    Jan 2011
    Location
    Russia
    Posts
    620
    Rep Power
    4

    Default

    I use last version Lucune 3.0.3.
    see Version (Lucene 3.0.1 API)
    What version do you use?
    Skype: petrarsentev
    http://TrackStudio.com

  7. #7
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    hi,

    tried your codes but i still have access denied. This tdt3 has many sub directories inside

    tdt3/1998/1.txt, tdt3/1999/1.txt (contain many date files)

  8. #8
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    hi,

    the error : Exception in thread "main" java.io.FileNotFoundException: tdt3\19981001 (Access is denied)

  9. #9
    Petr's Avatar
    Petr is offline Senior Member
    Join Date
    Jan 2011
    Location
    Russia
    Posts
    620
    Rep Power
    4

    Default

    Can you show a full structure your project?
    This is structure my project
    Java Code:
    .
    |-- filesToIndex
    |   `-- overview.html
    |-- indexDirectory
    |   |-- _0.cfs
    |   |-- _0.cfx
    |   |-- segments_2
    |   `-- segments.gen
    |-- lib
    |   |-- lucene-core-3.0.3.jar
    |   `-- lucene-highlighter-3.0.3.jar
    |-- LuceneHtml.iml
    |-- out
    |   |-- production
    |   |   `-- LuceneHtml
    |   |       `-- com
    |   |           `-- action
    |   |               `-- LuceneHtml.class
    |   `-- test
    |       `-- LuceneHtml
    `-- src
        `-- com
            `-- action
                `-- LuceneHtml.java
    Last edited by Petr; 03-16-2011 at 04:57 PM.
    Skype: petrarsentev
    http://TrackStudio.com

  10. #10
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    hi,

    hmm..is you dont mind, i have inserted a link of pictures to show the structures of the document collection. thanks.[IMG]1.jpeg[/IMG] [IMG]2.jpeg[/IMG]

  11. #11
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

  12. #12
    Petr's Avatar
    Petr is offline Senior Member
    Join Date
    Jan 2011
    Location
    Russia
    Posts
    620
    Rep Power
    4

    Default

    Ok. I looked your pictures. So you just can use recursive for reading all structure index folders. It's easy. if you want I would to write this code for you.
    Skype: petrarsentev
    http://TrackStudio.com

  13. #13
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    hi,

    how should i go about doin the recursive?and able to add in the files as in the code

    Directory directory = FSDirectory.getDirectory(INDEX_COLLECTION); or something that makes me able to do a search index. i know the building part is right, just cant connect it to the search index without passing parameter. cant think of any other way. thanks

  14. #14
    Petr's Avatar
    Petr is offline Senior Member
    Join Date
    Jan 2011
    Location
    Russia
    Posts
    620
    Rep Power
    4

    Default

    Yeah I looked your code. You used a recursive. so you have a collection of files for index.
    then you index each file. That is all.
    for example
    Java Code:
    for (File file : files) {
        createIndex(file);
    }
    Java Code:
    public static void createIndex(File index) throws IOException {
    		Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    		IndexWriter indexWriter = new IndexWriter(FSDirectory.open(index), analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
    		File dir = new File(FILES_TO_INDEX_DIRECTORY);
    		File[] files = dir.listFiles();
    		for (File file : files) {
    			Document document = new Document();
    			String path = file.getCanonicalPath();
    			document.add(new Field(FIELD_PATH, path, Field.Store.YES, Field.Index.ANALYZED));
    			Reader reader = new FileReader(file);
    			document.add(new Field(FIELD_CONTENTS, reader));
    			indexWriter.addDocument(document);
    		}
    		indexWriter.optimize();
    		indexWriter.close();
    	}
    Skype: petrarsentev
    http://TrackStudio.com

  15. #15
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    hi,

    Java Code:
    for (File file : files) {
        createIndex(file);
    }
    i only have this for loop at the createIndex() method only. Do you mean i supposed to put it inside?

  16. #16
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    hi,

    what argument should i put at the createIndex(..) at the main method?thanks.

  17. #17
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    hi,

    Java Code:
    public static void createIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_24);
            IndexWriter indexWriter = new IndexWriter(INDEX_COLLECTION, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
            File dir = new File(FILES_TO_INDEX_DIRECTORY);
           
            FilenameFilter select = new FileListFilter("txt");
            File[] folder = dir.listFiles();
            //File[] files = dirsub;
            for (int i = 0; i< folder.length;i++) {
                folder[i].getName();
                File[] sub = folder[i].listFiles();
                for(File subfiles: sub) {
               
               Document document = new Document();
    
                String path = subfiles.getCanonicalPath();
                document.add(new Field(FIELD_PATH, path, Field.Store.YES, Field.Index.UN_TOKENIZED));
    
                String docno = subfiles.getName();
                document.add(new Field(FIELD_ID, docno, Field.Store.YES, Field.Index.UN_TOKENIZED));
    
                String date = subfiles.getPath();
                document.add(new Field(FIELD_DATE, date, Field.Store.YES, Field.Index.UN_TOKENIZED));
    
                Reader reader = new FileReader(subfiles);
                document.add(new Field(FIELD_CONTENTS, reader));
    
                indexWriter.addDocument(document);
                }
            }
            indexWriter.optimize();
            indexWriter.close();
        
        }
    the lucene running and not doin anything

    and my main method

    Java Code:
    public static void main(String[] args) throws Exception {
    
    
            createIndex();
            System.out.println("search");
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(System.in));
            String s = br.readLine();
            searchIndex(s);
    
        }

  18. #18
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    I cant get the query. this is my codes. somehow it returned empty hits.

    Java Code:
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileReader;
    import java.io.FilenameFilter;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.io.Reader;
    import java.util.Iterator;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.CorruptIndexException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.queryParser.ParseException;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.search.Hit;
    import org.apache.lucene.search.Hits;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.store.LockObtainFailedException;
    import org.apache.lucene.util.Version;
    
    public class SearchEngine {
    
        //declare all the fields
        public static final String FILES_TO_INDEX_DIRECTORY = "tdt3/";
        public static final String INDEX_COLLECTION = "indexDirectory";
        public static final String FIELD_PATH = "path";
        public static final String FIELD_CONTENTS = "contents";
        public static final String FIELD_ID = "docno";
        public static final String FIELD_DATE = "date";
    
        public static void main(String[] args) throws Exception {
    
    
            //createIndex();
            System.out.println("search");
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(System.in));
            String s = br.readLine();
            searchIndex(s);
    
        }
    
        public static void createIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_24);
            IndexWriter indexWriter = new IndexWriter(INDEX_COLLECTION, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
            File dir = new File(FILES_TO_INDEX_DIRECTORY);
    
            FilenameFilter select = new FileListFilter("txt");
            File[] folder = dir.listFiles();
    
            for (int i = 0; i < folder.length; i++) {
                File[] sub = folder[i].listFiles(select);
                for (File subfiles : sub) {
    
    
                    Document document = new Document();
    
                    String path = subfiles.getPath();
                    document.add(new Field(FIELD_PATH, path, Field.Store.YES, Field.Index.UN_TOKENIZED));
    
                    String docno = subfiles.getName();
                    document.add(new Field(FIELD_ID, docno, Field.Store.YES, Field.Index.UN_TOKENIZED));
    
                    String date = subfiles.getPath();
                    document.add(new Field(FIELD_DATE, date, Field.Store.YES, Field.Index.UN_TOKENIZED));
    
                    BufferedReader reader = new BufferedReader(new FileReader(subfiles));
                    document.add(new Field(FIELD_CONTENTS, reader));
                    System.out.println("added" + subfiles);
    
                    indexWriter.addDocument(document);
                }
            }
            indexWriter.optimize();
            indexWriter.close();
    
        }
    
        public static void searchIndex(String words) throws IOException, ParseException {
            System.out.println("Searching for '" + words + "'");
            FSDirectory.open(new File("tdt3/"));
            //IndexReader indexReader = IndexReader.open(directory);
            IndexSearcher indexSearcher = new IndexSearcher(INDEX_COLLECTION);
    
            Analyzer analyzer = new StandardAnalyzer();
            QueryParser queryParser = new QueryParser(FIELD_PATH, analyzer);
            Query query = queryParser.parse(words);
            Hits hits = indexSearcher.search(query);
            System.out.println("Number of hits: " + hits.length());
    
            Iterator<Hit> it = hits.iterator();
            while (it.hasNext()) {
                Hit hit = it.next();
                Document document = hit.getDocument();
                String text = document.get(FIELD_PATH);
                System.out.println("Hit: " + text);
            }
    
        }
    
        public static class FileListFilter implements FilenameFilter {
    
            private String extension;
    
            public FileListFilter(String extension) {
    
                this.extension = extension;
            }
    
            public boolean accept(File directory, String filename) {
                boolean fileOK = true;
    
    
                if (extension != null) {
                    fileOK &= filename.endsWith('.' + extension);
                }
                return fileOK;
            }
        }
    }
    thanks for the feedback, i am still trying to spot where i did wrong.

  19. #19
    Petr's Avatar
    Petr is offline Senior Member
    Join Date
    Jan 2011
    Location
    Russia
    Posts
    620
    Rep Power
    4

    Default

    I advise you to figure out how to make index for one file. Then you understand how you can index all files in index folder.
    Skype: petrarsentev
    http://TrackStudio.com

  20. #20
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    hi petr,

    i have tested with one file only and it worked. and the problem yesterday regarding reading subfolder also has been solved. i am using nested loop to read the folder and its subfolder.

    but now i am trying to search the index but the number of hits returned 0.

Page 1 of 2 12 LastLast

Similar Threads

  1. Lucene beginner. HELP!!!
    By adbawany in forum Lucene
    Replies: 2
    Last Post: 01-22-2011, 11:18 AM
  2. Monitoring lucene
    By arumilli in forum Lucene
    Replies: 0
    Last Post: 08-05-2010, 05:40 PM
  3. Indexing XML using lucene
    By peliukasss in forum Lucene
    Replies: 0
    Last Post: 03-28-2010, 10:20 PM
  4. Apache Lucene
    By dkarthiin in forum Lucene
    Replies: 0
    Last Post: 03-25-2010, 12:25 PM
  5. Apache Lucene 2.3.2
    By Java Tip in forum Java Software
    Replies: 0
    Last Post: 05-08-2008, 06:49 PM

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •