Page 1 of 2 12 LastLast
Results 1 to 20 of 37

Thread: Lucene issue

  1. #1
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default Lucene issue

    Java Code:
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.queryParser.ParseException;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.search.Hits;
    import org.apache.lucene.index.*;
    import java.io.*;
    import java.util.ArrayList;
    import java.util.Iterator;
    import java.util.Scanner;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.Hit;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.store.RAMDirectory;
    
    /**
     * This terminal application creates an Apache Lucene index in a folder and adds files into this index
     * based on the input of the user.
     */
    public class TextFileIndexer {
    
        //declare all the fields
        
        public static final String FIELD_PATH = "path";
        public static final String FIELD_CONTENTS = "contents";
        public static final String FIELD_ID = "docno";
        public static final String FIELD_DATE = "date";
    
        private IndexWriter writer;
        private ArrayList<File> queue = new ArrayList<File>();
    
        @SuppressWarnings("static-access")
        public static void main(String[] args) throws IOException, ParseException {
    
            String s = null;
            Scanner sc = new Scanner(System.in);
            //Menu selections
            int choice;
            do {
                System.out.println("Welcome Search Engine. Please choose" + " your selections below\n" + "(1) Build index collection\n" + "(2) Search for the documents ids\n" + "(3) Exit\n");
    
                System.out.print("Enter your choice: ");
                choice = sc.nextInt();
    
                //switch statements
                switch (choice) {
    
                    case 1:
    
                        System.out.println("Enter the path where the index will be created: ");
    
                        BufferedReader br = new BufferedReader(
                                new InputStreamReader(System.in));
                        s = br.readLine();
    
                        TextFileIndexer indexer = null;
                        try {
                            indexer = new TextFileIndexer(s);
                        } catch (Exception ex) {
                            System.out.println("Cannot create index..." + ex.getMessage());
                            System.exit(-1);
                        }
                        try {
                            System.out.println("Enter the file or folder name to add into the index (q=quit):");
                            System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
                            s = br.readLine();
                            //try to add file into the index
                            indexer.indexFileOrDirectory(s);
                        } catch (Exception e) {
                            System.out.println("Error indexing " + s + " : " + e.getMessage());
                        }
                        indexer.closeIndex();
    
                        break;
    
                    case 2:
    
                        System.out.println("your query?");
                        br = new BufferedReader(
                                new InputStreamReader(System.in));
                        String a = br.readLine();
                        
                       searchIndex(a);
                        break;
                    case 3:
    
                        //exit the program
                        System.out.println("Program exiting..");
                        break;
    
                    default:
    
                        //display invalid selection
                        System.err.println("Err: Invalid selection");
                }
            } while (choice != 3);
    
        }
    
    
        public TextFileIndexer(String index) throws IOException, ParseException {
            // the boolean true parameter means to create a new index everytime,
            // potentially overwriting any existing files there.
            writer = new IndexWriter(index, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
        }
    
        public static void searchIndex(String words) throws IOException, ParseException{
    
            System.out.println("Searching for '" + words + "'"  );
            [B]Directory directory = FSDirectory.getDirectory(INDEX_COLLECTION);[/B]
            IndexReader indexReader = IndexReader.open(directory);
            IndexSearcher indexSearcher = new IndexSearcher(indexReader);
    
            Analyzer analyzer = new StandardAnalyzer();
            QueryParser queryParser = new QueryParser(FIELD_ID, analyzer);
            Query query = queryParser.parse(words);
            Hits hits = indexSearcher.search(query);
            System.out.println("Number of hits: " + hits.length());
    
            Iterator<Hit> it = hits.iterator();
            while (it.hasNext()) {
                Hit hit = it.next();
                Document document = hit.getDocument();
                String text = document.get(FIELD_ID);
                System.out.println("Hit: " + text);
            }
    
        }
    
        public void indexFileOrDirectory(String fileName) throws IOException {
    
            listFiles(new File(fileName));
    
            int originalNumDocs = writer.numDocs();
            for (File file : queue) {
                FileReader fr = null;
                try {
                    Document doc = new Document();
                    fr = new FileReader(file);
                    doc.add(new Field(FIELD_CONTENTS, fr));
    
                    String path = file.getCanonicalPath();
                    doc.add(new Field(FIELD_PATH, path, Field.Store.YES, Field.Index.UN_TOKENIZED));
    
                    String docno = file.getName();
                    doc.add(new Field(FIELD_ID, docno, Field.Store.YES, Field.Index.UN_TOKENIZED));
    
                    String date = file.getPath();
                    doc.add(new Field(FIELD_DATE, date, Field.Store.YES, Field.Index.UN_TOKENIZED));
    
                    writer.addDocument(doc);
                    System.out.println("Added: " + file);
    
    
                } catch (Exception e) {
                    System.out.println("Could not add: " + file);
                } finally {
                    fr.close();
                }
            }
    
            int newNumDocs = writer.numDocs();
            System.out.println("");
            System.out.println("************************");
            System.out.println((newNumDocs - originalNumDocs) + " documents added.");
            System.out.println("************************");
    
            queue.clear();
        }
    
        private void listFiles(File file) {
            if (!file.exists()) {
                System.out.println(file + " does not exist.");
            }
            if (file.isDirectory()) {
                for (File f : file.listFiles()) {
                    listFiles(f);
                }
            } else {
                String filename = file.getName().toLowerCase();
              
                if (filename.endsWith(".htm") || filename.endsWith(".html") ||
                        filename.endsWith(".xml") || filename.endsWith(".txt")) {
                    queue.add(file);
                } else {
                    System.out.println("Skipped " + filename);
                }
            }
        }
    
        public void closeIndex() throws IOException {
            writer.optimize();
            writer.close();
        }
    
         
    }
    hi guys,

    the problem is in bold. what i would like to have is to get the directory of my files.thanks. but i unable to do so.

  2. #2
    Petr's Avatar
    Petr is offline Senior Member
    Join Date
    Jan 2011
    Location
    Russia
    Posts
    620
    Rep Power
    10

    Default

    Sorry You are very difficult understand what you want.
    Do you want to index a special folder?
    Skype: petrarsentev
    http://TrackStudio.com

  3. #3
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    hi,

    yes.its a special folder which consist of "tdt3/19982001/(and the text files). how should i be able to get the directory of this ?thanks.i am able to build the index. but when i try to do a search index, i dont know how to get the directory. i try manually but its said "access is denied" thanks.

  4. #4
    Petr's Avatar
    Petr is offline Senior Member
    Join Date
    Jan 2011
    Location
    Russia
    Posts
    620
    Rep Power
    10

    Default

    see
    Java Code:
    package com.action;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.search.*;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    
    import java.io.File;
    import java.io.FileReader;
    import java.io.IOException;
    import java.io.Reader;
    
    public class LuceneHtml {
    
    	public static final String FILES_TO_INDEX_DIRECTORY = "filesToIndex";
    	public static final String INDEX_DIRECTORY = "indexDirectory";
    
    	public static final String FIELD_PATH = "path";
    	public static final String FIELD_CONTENTS = "contents";
    
    	public static void main(String[] args) throws Exception {
    		createIndex();
    		searchIndex("msg");
    	}
    
    	public static void createIndex() throws IOException {
    		Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    		File index = new File(INDEX_DIRECTORY);
    		IndexWriter indexWriter = new IndexWriter(FSDirectory.open(index), analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
    		File dir = new File(FILES_TO_INDEX_DIRECTORY);
    		File[] files = dir.listFiles();
    		for (File file : files) {
    			Document document = new Document();
    			String path = file.getCanonicalPath();
    			document.add(new Field(FIELD_PATH, path, Field.Store.YES, Field.Index.ANALYZED));
    			Reader reader = new FileReader(file);
    			document.add(new Field(FIELD_CONTENTS, reader));
    			indexWriter.addDocument(document);
    		}
    		indexWriter.optimize();
    		indexWriter.close();
    	}
    
    	public static void searchIndex(String searchString) throws Exception {
    		System.out.println("Searching for '" + searchString + "'");
            TopDocsCollector collector = TopFieldCollector.create(new Sort(new SortField(null, SortField.DOC, true)), 100, false, false, false, false);
    		Directory directory = FSDirectory.open(new File(INDEX_DIRECTORY));
    		IndexReader indexReader = IndexReader.open(directory);
    		IndexSearcher indexSearcher = new IndexSearcher(indexReader);
    
    		Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    		QueryParser queryParser = new QueryParser(Version.LUCENE_30, FIELD_CONTENTS, analyzer);
    		Query query = queryParser.parse(searchString);
            indexSearcher.search(query, collector);
    		ScoreDoc[] hits = collector.topDocs().scoreDocs;
    		System.out.println("Number of hits: " + hits.length);
    
    		for (ScoreDoc hit : hits) {
    			Document document = indexSearcher.doc(hit.doc);
    			String path = document.get(FIELD_PATH);
    			System.out.println("Hit: " + path);
    		}
    	}
    }
    Skype: petrarsentev
    http://TrackStudio.com

  5. #5
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    hi,

    may i know what lucene 3.0 is?thanks.

  6. #6
    Petr's Avatar
    Petr is offline Senior Member
    Join Date
    Jan 2011
    Location
    Russia
    Posts
    620
    Rep Power
    10

    Default

    I use last version Lucune 3.0.3.
    see Version (Lucene 3.0.1 API)
    What version do you use?
    Skype: petrarsentev
    http://TrackStudio.com

  7. #7
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    hi,

    tried your codes but i still have access denied. This tdt3 has many sub directories inside

    tdt3/1998/1.txt, tdt3/1999/1.txt (contain many date files)

  8. #8
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    hi,

    the error : Exception in thread "main" java.io.FileNotFoundException: tdt3\19981001 (Access is denied)

  9. #9
    Petr's Avatar
    Petr is offline Senior Member
    Join Date
    Jan 2011
    Location
    Russia
    Posts
    620
    Rep Power
    10

    Default

    Can you show a full structure your project?
    This is structure my project
    Java Code:
    .
    |-- filesToIndex
    |   `-- overview.html
    |-- indexDirectory
    |   |-- _0.cfs
    |   |-- _0.cfx
    |   |-- segments_2
    |   `-- segments.gen
    |-- lib
    |   |-- lucene-core-3.0.3.jar
    |   `-- lucene-highlighter-3.0.3.jar
    |-- LuceneHtml.iml
    |-- out
    |   |-- production
    |   |   `-- LuceneHtml
    |   |       `-- com
    |   |           `-- action
    |   |               `-- LuceneHtml.class
    |   `-- test
    |       `-- LuceneHtml
    `-- src
        `-- com
            `-- action
                `-- LuceneHtml.java
    Last edited by Petr; 03-16-2011 at 05:57 PM.
    Skype: petrarsentev
    http://TrackStudio.com

  10. #10
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    hi,

    hmm..is you dont mind, i have inserted a link of pictures to show the structures of the document collection. thanks.[IMG]1.jpeg[/IMG] [IMG]2.jpeg[/IMG]

  11. #11
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

  12. #12
    Petr's Avatar
    Petr is offline Senior Member
    Join Date
    Jan 2011
    Location
    Russia
    Posts
    620
    Rep Power
    10

    Default

    Ok. I looked your pictures. So you just can use recursive for reading all structure index folders. It's easy. if you want I would to write this code for you.
    Skype: petrarsentev
    http://TrackStudio.com

  13. #13
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    hi,

    how should i go about doin the recursive?and able to add in the files as in the code

    Directory directory = FSDirectory.getDirectory(INDEX_COLLECTION); or something that makes me able to do a search index. i know the building part is right, just cant connect it to the search index without passing parameter. cant think of any other way. thanks

  14. #14
    Petr's Avatar
    Petr is offline Senior Member
    Join Date
    Jan 2011
    Location
    Russia
    Posts
    620
    Rep Power
    10

    Default

    Yeah I looked your code. You used a recursive. so you have a collection of files for index.
    then you index each file. That is all.
    for example
    Java Code:
    for (File file : files) {
        createIndex(file);
    }
    Java Code:
    public static void createIndex(File index) throws IOException {
    		Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    		IndexWriter indexWriter = new IndexWriter(FSDirectory.open(index), analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
    		File dir = new File(FILES_TO_INDEX_DIRECTORY);
    		File[] files = dir.listFiles();
    		for (File file : files) {
    			Document document = new Document();
    			String path = file.getCanonicalPath();
    			document.add(new Field(FIELD_PATH, path, Field.Store.YES, Field.Index.ANALYZED));
    			Reader reader = new FileReader(file);
    			document.add(new Field(FIELD_CONTENTS, reader));
    			indexWriter.addDocument(document);
    		}
    		indexWriter.optimize();
    		indexWriter.close();
    	}
    Skype: petrarsentev
    http://TrackStudio.com

  15. #15
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    hi,

    Java Code:
    for (File file : files) {
        createIndex(file);
    }
    i only have this for loop at the createIndex() method only. Do you mean i supposed to put it inside?

  16. #16
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    hi,

    what argument should i put at the createIndex(..) at the main method?thanks.

  17. #17
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    hi,

    Java Code:
    public static void createIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_24);
            IndexWriter indexWriter = new IndexWriter(INDEX_COLLECTION, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
            File dir = new File(FILES_TO_INDEX_DIRECTORY);
           
            FilenameFilter select = new FileListFilter("txt");
            File[] folder = dir.listFiles();
            //File[] files = dirsub;
            for (int i = 0; i< folder.length;i++) {
                folder[i].getName();
                File[] sub = folder[i].listFiles();
                for(File subfiles: sub) {
               
               Document document = new Document();
    
                String path = subfiles.getCanonicalPath();
                document.add(new Field(FIELD_PATH, path, Field.Store.YES, Field.Index.UN_TOKENIZED));
    
                String docno = subfiles.getName();
                document.add(new Field(FIELD_ID, docno, Field.Store.YES, Field.Index.UN_TOKENIZED));
    
                String date = subfiles.getPath();
                document.add(new Field(FIELD_DATE, date, Field.Store.YES, Field.Index.UN_TOKENIZED));
    
                Reader reader = new FileReader(subfiles);
                document.add(new Field(FIELD_CONTENTS, reader));
    
                indexWriter.addDocument(document);
                }
            }
            indexWriter.optimize();
            indexWriter.close();
        
        }
    the lucene running and not doin anything

    and my main method

    Java Code:
    public static void main(String[] args) throws Exception {
    
    
            createIndex();
            System.out.println("search");
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(System.in));
            String s = br.readLine();
            searchIndex(s);
    
        }

  18. #18
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    I cant get the query. this is my codes. somehow it returned empty hits.

    Java Code:
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileReader;
    import java.io.FilenameFilter;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.io.Reader;
    import java.util.Iterator;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.CorruptIndexException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.queryParser.ParseException;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.search.Hit;
    import org.apache.lucene.search.Hits;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.store.LockObtainFailedException;
    import org.apache.lucene.util.Version;
    
    public class SearchEngine {
    
        //declare all the fields
        public static final String FILES_TO_INDEX_DIRECTORY = "tdt3/";
        public static final String INDEX_COLLECTION = "indexDirectory";
        public static final String FIELD_PATH = "path";
        public static final String FIELD_CONTENTS = "contents";
        public static final String FIELD_ID = "docno";
        public static final String FIELD_DATE = "date";
    
        public static void main(String[] args) throws Exception {
    
    
            //createIndex();
            System.out.println("search");
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(System.in));
            String s = br.readLine();
            searchIndex(s);
    
        }
    
        public static void createIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_24);
            IndexWriter indexWriter = new IndexWriter(INDEX_COLLECTION, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
            File dir = new File(FILES_TO_INDEX_DIRECTORY);
    
            FilenameFilter select = new FileListFilter("txt");
            File[] folder = dir.listFiles();
    
            for (int i = 0; i < folder.length; i++) {
                File[] sub = folder[i].listFiles(select);
                for (File subfiles : sub) {
    
    
                    Document document = new Document();
    
                    String path = subfiles.getPath();
                    document.add(new Field(FIELD_PATH, path, Field.Store.YES, Field.Index.UN_TOKENIZED));
    
                    String docno = subfiles.getName();
                    document.add(new Field(FIELD_ID, docno, Field.Store.YES, Field.Index.UN_TOKENIZED));
    
                    String date = subfiles.getPath();
                    document.add(new Field(FIELD_DATE, date, Field.Store.YES, Field.Index.UN_TOKENIZED));
    
                    BufferedReader reader = new BufferedReader(new FileReader(subfiles));
                    document.add(new Field(FIELD_CONTENTS, reader));
                    System.out.println("added" + subfiles);
    
                    indexWriter.addDocument(document);
                }
            }
            indexWriter.optimize();
            indexWriter.close();
    
        }
    
        public static void searchIndex(String words) throws IOException, ParseException {
            System.out.println("Searching for '" + words + "'");
            FSDirectory.open(new File("tdt3/"));
            //IndexReader indexReader = IndexReader.open(directory);
            IndexSearcher indexSearcher = new IndexSearcher(INDEX_COLLECTION);
    
            Analyzer analyzer = new StandardAnalyzer();
            QueryParser queryParser = new QueryParser(FIELD_PATH, analyzer);
            Query query = queryParser.parse(words);
            Hits hits = indexSearcher.search(query);
            System.out.println("Number of hits: " + hits.length());
    
            Iterator<Hit> it = hits.iterator();
            while (it.hasNext()) {
                Hit hit = it.next();
                Document document = hit.getDocument();
                String text = document.get(FIELD_PATH);
                System.out.println("Hit: " + text);
            }
    
        }
    
        public static class FileListFilter implements FilenameFilter {
    
            private String extension;
    
            public FileListFilter(String extension) {
    
                this.extension = extension;
            }
    
            public boolean accept(File directory, String filename) {
                boolean fileOK = true;
    
    
                if (extension != null) {
                    fileOK &= filename.endsWith('.' + extension);
                }
                return fileOK;
            }
        }
    }
    thanks for the feedback, i am still trying to spot where i did wrong.

  19. #19
    Petr's Avatar
    Petr is offline Senior Member
    Join Date
    Jan 2011
    Location
    Russia
    Posts
    620
    Rep Power
    10

    Default

    I advise you to figure out how to make index for one file. Then you understand how you can index all files in index folder.
    Skype: petrarsentev
    http://TrackStudio.com

  20. #20
    drogba123 is offline Member
    Join Date
    Mar 2011
    Posts
    29
    Rep Power
    0

    Default

    hi petr,

    i have tested with one file only and it worked. and the problem yesterday regarding reading subfolder also has been solved. i am using nested loop to read the folder and its subfolder.

    but now i am trying to search the index but the number of hits returned 0.

Page 1 of 2 12 LastLast

Similar Threads

  1. Lucene beginner. HELP!!!
    By adbawany in forum Lucene
    Replies: 2
    Last Post: 01-22-2011, 12:18 PM
  2. Monitoring lucene
    By arumilli in forum Lucene
    Replies: 0
    Last Post: 08-05-2010, 06:40 PM
  3. Indexing XML using lucene
    By peliukasss in forum Lucene
    Replies: 0
    Last Post: 03-28-2010, 11:20 PM
  4. Apache Lucene
    By dkarthiin in forum Lucene
    Replies: 0
    Last Post: 03-25-2010, 01:25 PM
  5. Apache Lucene 2.3.2
    By Java Tip in forum Java Software
    Replies: 0
    Last Post: 05-08-2008, 07:49 PM

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •