Results 1 to 20 of 37
Thread: Lucene issue
- 03-16-2011, 04:26 PM #1
Member
- Join Date
- Mar 2011
- Posts
- 29
- Rep Power
- 0
Lucene issue
hi guys,Java Code:import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.Hits; import org.apache.lucene.index.*; import java.io.*; import java.util.ArrayList; import java.util.Iterator; import java.util.Scanner; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Hit; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.RAMDirectory; /** * This terminal application creates an Apache Lucene index in a folder and adds files into this index * based on the input of the user. */ public class TextFileIndexer { //declare all the fields public static final String FIELD_PATH = "path"; public static final String FIELD_CONTENTS = "contents"; public static final String FIELD_ID = "docno"; public static final String FIELD_DATE = "date"; private IndexWriter writer; private ArrayList<File> queue = new ArrayList<File>(); @SuppressWarnings("static-access") public static void main(String[] args) throws IOException, ParseException { String s = null; Scanner sc = new Scanner(System.in); //Menu selections int choice; do { System.out.println("Welcome Search Engine. Please choose" + " your selections below\n" + "(1) Build index collection\n" + "(2) Search for the documents ids\n" + "(3) Exit\n"); System.out.print("Enter your choice: "); choice = sc.nextInt(); //switch statements switch (choice) { case 1: System.out.println("Enter the path where the index will be created: "); BufferedReader br = new BufferedReader( new InputStreamReader(System.in)); s = br.readLine(); TextFileIndexer indexer = null; try { indexer = new TextFileIndexer(s); } catch (Exception ex) { System.out.println("Cannot create index..." + ex.getMessage()); System.exit(-1); } try { System.out.println("Enter the file or folder name to add into the index (q=quit):"); System.out.println("[Acceptable file types: .xml, .html, .html, .txt]"); s = br.readLine(); //try to add file into the index indexer.indexFileOrDirectory(s); } catch (Exception e) { System.out.println("Error indexing " + s + " : " + e.getMessage()); } indexer.closeIndex(); break; case 2: System.out.println("your query?"); br = new BufferedReader( new InputStreamReader(System.in)); String a = br.readLine(); searchIndex(a); break; case 3: //exit the program System.out.println("Program exiting.."); break; default: //display invalid selection System.err.println("Err: Invalid selection"); } } while (choice != 3); } public TextFileIndexer(String index) throws IOException, ParseException { // the boolean true parameter means to create a new index everytime, // potentially overwriting any existing files there. writer = new IndexWriter(index, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); } public static void searchIndex(String words) throws IOException, ParseException{ System.out.println("Searching for '" + words + "'" ); [B]Directory directory = FSDirectory.getDirectory(INDEX_COLLECTION);[/B] IndexReader indexReader = IndexReader.open(directory); IndexSearcher indexSearcher = new IndexSearcher(indexReader); Analyzer analyzer = new StandardAnalyzer(); QueryParser queryParser = new QueryParser(FIELD_ID, analyzer); Query query = queryParser.parse(words); Hits hits = indexSearcher.search(query); System.out.println("Number of hits: " + hits.length()); Iterator<Hit> it = hits.iterator(); while (it.hasNext()) { Hit hit = it.next(); Document document = hit.getDocument(); String text = document.get(FIELD_ID); System.out.println("Hit: " + text); } } public void indexFileOrDirectory(String fileName) throws IOException { listFiles(new File(fileName)); int originalNumDocs = writer.numDocs(); for (File file : queue) { FileReader fr = null; try { Document doc = new Document(); fr = new FileReader(file); doc.add(new Field(FIELD_CONTENTS, fr)); String path = file.getCanonicalPath(); doc.add(new Field(FIELD_PATH, path, Field.Store.YES, Field.Index.UN_TOKENIZED)); String docno = file.getName(); doc.add(new Field(FIELD_ID, docno, Field.Store.YES, Field.Index.UN_TOKENIZED)); String date = file.getPath(); doc.add(new Field(FIELD_DATE, date, Field.Store.YES, Field.Index.UN_TOKENIZED)); writer.addDocument(doc); System.out.println("Added: " + file); } catch (Exception e) { System.out.println("Could not add: " + file); } finally { fr.close(); } } int newNumDocs = writer.numDocs(); System.out.println(""); System.out.println("************************"); System.out.println((newNumDocs - originalNumDocs) + " documents added."); System.out.println("************************"); queue.clear(); } private void listFiles(File file) { if (!file.exists()) { System.out.println(file + " does not exist."); } if (file.isDirectory()) { for (File f : file.listFiles()) { listFiles(f); } } else { String filename = file.getName().toLowerCase(); if (filename.endsWith(".htm") || filename.endsWith(".html") || filename.endsWith(".xml") || filename.endsWith(".txt")) { queue.add(file); } else { System.out.println("Skipped " + filename); } } } public void closeIndex() throws IOException { writer.optimize(); writer.close(); } }
the problem is in bold. what i would like to have is to get the directory of my files.thanks. but i unable to do so.
- 03-16-2011, 04:36 PM #2
Sorry You are very difficult understand what you want.
Do you want to index a special folder?Skype: petrarsentev
http://TrackStudio.com
- 03-16-2011, 04:38 PM #3
Member
- Join Date
- Mar 2011
- Posts
- 29
- Rep Power
- 0
hi,
yes.its a special folder which consist of "tdt3/19982001/(and the text files). how should i be able to get the directory of this ?thanks.i am able to build the index. but when i try to do a search index, i dont know how to get the directory. i try manually but its said "access is denied" thanks.
- 03-16-2011, 04:45 PM #4
see
Java Code:package com.action; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.*; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.Reader; public class LuceneHtml { public static final String FILES_TO_INDEX_DIRECTORY = "filesToIndex"; public static final String INDEX_DIRECTORY = "indexDirectory"; public static final String FIELD_PATH = "path"; public static final String FIELD_CONTENTS = "contents"; public static void main(String[] args) throws Exception { createIndex(); searchIndex("msg"); } public static void createIndex() throws IOException { Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30); File index = new File(INDEX_DIRECTORY); IndexWriter indexWriter = new IndexWriter(FSDirectory.open(index), analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED); File dir = new File(FILES_TO_INDEX_DIRECTORY); File[] files = dir.listFiles(); for (File file : files) { Document document = new Document(); String path = file.getCanonicalPath(); document.add(new Field(FIELD_PATH, path, Field.Store.YES, Field.Index.ANALYZED)); Reader reader = new FileReader(file); document.add(new Field(FIELD_CONTENTS, reader)); indexWriter.addDocument(document); } indexWriter.optimize(); indexWriter.close(); } public static void searchIndex(String searchString) throws Exception { System.out.println("Searching for '" + searchString + "'"); TopDocsCollector collector = TopFieldCollector.create(new Sort(new SortField(null, SortField.DOC, true)), 100, false, false, false, false); Directory directory = FSDirectory.open(new File(INDEX_DIRECTORY)); IndexReader indexReader = IndexReader.open(directory); IndexSearcher indexSearcher = new IndexSearcher(indexReader); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30); QueryParser queryParser = new QueryParser(Version.LUCENE_30, FIELD_CONTENTS, analyzer); Query query = queryParser.parse(searchString); indexSearcher.search(query, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; System.out.println("Number of hits: " + hits.length); for (ScoreDoc hit : hits) { Document document = indexSearcher.doc(hit.doc); String path = document.get(FIELD_PATH); System.out.println("Hit: " + path); } } }Skype: petrarsentev
http://TrackStudio.com
- 03-16-2011, 04:50 PM #5
Member
- Join Date
- Mar 2011
- Posts
- 29
- Rep Power
- 0
hi,
may i know what lucene 3.0 is?thanks.
- 03-16-2011, 04:52 PM #6
I use last version Lucune 3.0.3.
see Version (Lucene 3.0.1 API)
What version do you use?Skype: petrarsentev
http://TrackStudio.com
- 03-16-2011, 04:52 PM #7
Member
- Join Date
- Mar 2011
- Posts
- 29
- Rep Power
- 0
hi,
tried your codes but i still have access denied. This tdt3 has many sub directories inside
tdt3/1998/1.txt, tdt3/1999/1.txt (contain many date files)
- 03-16-2011, 04:54 PM #8
Member
- Join Date
- Mar 2011
- Posts
- 29
- Rep Power
- 0
hi,
the error : Exception in thread "main" java.io.FileNotFoundException: tdt3\19981001 (Access is denied)
- 03-16-2011, 04:55 PM #9
Can you show a full structure your project?
This is structure my project
Java Code:. |-- filesToIndex | `-- overview.html |-- indexDirectory | |-- _0.cfs | |-- _0.cfx | |-- segments_2 | `-- segments.gen |-- lib | |-- lucene-core-3.0.3.jar | `-- lucene-highlighter-3.0.3.jar |-- LuceneHtml.iml |-- out | |-- production | | `-- LuceneHtml | | `-- com | | `-- action | | `-- LuceneHtml.class | `-- test | `-- LuceneHtml `-- src `-- com `-- action `-- LuceneHtml.javaLast edited by Petr; 03-16-2011 at 04:57 PM.
Skype: petrarsentev
http://TrackStudio.com
- 03-16-2011, 04:59 PM #10
Member
- Join Date
- Mar 2011
- Posts
- 29
- Rep Power
- 0
hi,
hmm..is you dont mind, i have inserted a link of pictures to show the structures of the document collection. thanks.[IMG]1.jpeg[/IMG] [IMG]2.jpeg[/IMG]
- 03-16-2011, 05:02 PM #11
Member
- Join Date
- Mar 2011
- Posts
- 29
- Rep Power
- 0
- 03-16-2011, 05:14 PM #12
Ok. I looked your pictures. So you just can use recursive for reading all structure index folders. It's easy. if you want I would to write this code for you.
Skype: petrarsentev
http://TrackStudio.com
- 03-16-2011, 05:16 PM #13
Member
- Join Date
- Mar 2011
- Posts
- 29
- Rep Power
- 0
hi,
how should i go about doin the recursive?and able to add in the files as in the code
Directory directory = FSDirectory.getDirectory(INDEX_COLLECTION); or something that makes me able to do a search index. i know the building part is right, just cant connect it to the search index without passing parameter. cant think of any other way. thanks
- 03-16-2011, 05:46 PM #14
Yeah I looked your code. You used a recursive. so you have a collection of files for index.
then you index each file. That is all.
for example
Java Code:for (File file : files) { createIndex(file); }Java Code:public static void createIndex(File index) throws IOException { Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30); IndexWriter indexWriter = new IndexWriter(FSDirectory.open(index), analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED); File dir = new File(FILES_TO_INDEX_DIRECTORY); File[] files = dir.listFiles(); for (File file : files) { Document document = new Document(); String path = file.getCanonicalPath(); document.add(new Field(FIELD_PATH, path, Field.Store.YES, Field.Index.ANALYZED)); Reader reader = new FileReader(file); document.add(new Field(FIELD_CONTENTS, reader)); indexWriter.addDocument(document); } indexWriter.optimize(); indexWriter.close(); }Skype: petrarsentev
http://TrackStudio.com
- 03-16-2011, 05:53 PM #15
Member
- Join Date
- Mar 2011
- Posts
- 29
- Rep Power
- 0
hi,
i only have this for loop at the createIndex() method only. Do you mean i supposed to put it inside?Java Code:for (File file : files) { createIndex(file); }
- 03-16-2011, 06:04 PM #16
Member
- Join Date
- Mar 2011
- Posts
- 29
- Rep Power
- 0
hi,
what argument should i put at the createIndex(..) at the main method?thanks.
- 03-17-2011, 05:39 AM #17
Member
- Join Date
- Mar 2011
- Posts
- 29
- Rep Power
- 0
hi,
the lucene running and not doin anythingJava Code:public static void createIndex() throws CorruptIndexException, LockObtainFailedException, IOException { Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_24); IndexWriter indexWriter = new IndexWriter(INDEX_COLLECTION, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED); File dir = new File(FILES_TO_INDEX_DIRECTORY); FilenameFilter select = new FileListFilter("txt"); File[] folder = dir.listFiles(); //File[] files = dirsub; for (int i = 0; i< folder.length;i++) { folder[i].getName(); File[] sub = folder[i].listFiles(); for(File subfiles: sub) { Document document = new Document(); String path = subfiles.getCanonicalPath(); document.add(new Field(FIELD_PATH, path, Field.Store.YES, Field.Index.UN_TOKENIZED)); String docno = subfiles.getName(); document.add(new Field(FIELD_ID, docno, Field.Store.YES, Field.Index.UN_TOKENIZED)); String date = subfiles.getPath(); document.add(new Field(FIELD_DATE, date, Field.Store.YES, Field.Index.UN_TOKENIZED)); Reader reader = new FileReader(subfiles); document.add(new Field(FIELD_CONTENTS, reader)); indexWriter.addDocument(document); } } indexWriter.optimize(); indexWriter.close(); }
and my main method
Java Code:public static void main(String[] args) throws Exception { createIndex(); System.out.println("search"); BufferedReader br = new BufferedReader( new InputStreamReader(System.in)); String s = br.readLine(); searchIndex(s); }
- 03-17-2011, 07:57 AM #18
Member
- Join Date
- Mar 2011
- Posts
- 29
- Rep Power
- 0
I cant get the query. this is my codes. somehow it returned empty hits.
thanks for the feedback, i am still trying to spot where i did wrong.Java Code:import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.FilenameFilter; import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; import java.util.Iterator; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.Hit; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.util.Version; public class SearchEngine { //declare all the fields public static final String FILES_TO_INDEX_DIRECTORY = "tdt3/"; public static final String INDEX_COLLECTION = "indexDirectory"; public static final String FIELD_PATH = "path"; public static final String FIELD_CONTENTS = "contents"; public static final String FIELD_ID = "docno"; public static final String FIELD_DATE = "date"; public static void main(String[] args) throws Exception { //createIndex(); System.out.println("search"); BufferedReader br = new BufferedReader( new InputStreamReader(System.in)); String s = br.readLine(); searchIndex(s); } public static void createIndex() throws CorruptIndexException, LockObtainFailedException, IOException { Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_24); IndexWriter indexWriter = new IndexWriter(INDEX_COLLECTION, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED); File dir = new File(FILES_TO_INDEX_DIRECTORY); FilenameFilter select = new FileListFilter("txt"); File[] folder = dir.listFiles(); for (int i = 0; i < folder.length; i++) { File[] sub = folder[i].listFiles(select); for (File subfiles : sub) { Document document = new Document(); String path = subfiles.getPath(); document.add(new Field(FIELD_PATH, path, Field.Store.YES, Field.Index.UN_TOKENIZED)); String docno = subfiles.getName(); document.add(new Field(FIELD_ID, docno, Field.Store.YES, Field.Index.UN_TOKENIZED)); String date = subfiles.getPath(); document.add(new Field(FIELD_DATE, date, Field.Store.YES, Field.Index.UN_TOKENIZED)); BufferedReader reader = new BufferedReader(new FileReader(subfiles)); document.add(new Field(FIELD_CONTENTS, reader)); System.out.println("added" + subfiles); indexWriter.addDocument(document); } } indexWriter.optimize(); indexWriter.close(); } public static void searchIndex(String words) throws IOException, ParseException { System.out.println("Searching for '" + words + "'"); FSDirectory.open(new File("tdt3/")); //IndexReader indexReader = IndexReader.open(directory); IndexSearcher indexSearcher = new IndexSearcher(INDEX_COLLECTION); Analyzer analyzer = new StandardAnalyzer(); QueryParser queryParser = new QueryParser(FIELD_PATH, analyzer); Query query = queryParser.parse(words); Hits hits = indexSearcher.search(query); System.out.println("Number of hits: " + hits.length()); Iterator<Hit> it = hits.iterator(); while (it.hasNext()) { Hit hit = it.next(); Document document = hit.getDocument(); String text = document.get(FIELD_PATH); System.out.println("Hit: " + text); } } public static class FileListFilter implements FilenameFilter { private String extension; public FileListFilter(String extension) { this.extension = extension; } public boolean accept(File directory, String filename) { boolean fileOK = true; if (extension != null) { fileOK &= filename.endsWith('.' + extension); } return fileOK; } } }
- 03-17-2011, 08:18 AM #19
I advise you to figure out how to make index for one file. Then you understand how you can index all files in index folder.
Skype: petrarsentev
http://TrackStudio.com
- 03-17-2011, 08:21 AM #20
Member
- Join Date
- Mar 2011
- Posts
- 29
- Rep Power
- 0
Similar Threads
-
Lucene beginner. HELP!!!
By adbawany in forum LuceneReplies: 2Last Post: 01-22-2011, 11:18 AM -
Monitoring lucene
By arumilli in forum LuceneReplies: 0Last Post: 08-05-2010, 05:40 PM -
Indexing XML using lucene
By peliukasss in forum LuceneReplies: 0Last Post: 03-28-2010, 10:20 PM -
Apache Lucene
By dkarthiin in forum LuceneReplies: 0Last Post: 03-25-2010, 12:25 PM -
Apache Lucene 2.3.2
By Java Tip in forum Java SoftwareReplies: 0Last Post: 05-08-2008, 06:49 PM


LinkBack URL
About LinkBacks
Reply With Quote
Bookmarks