-
Lucene issue
Code:
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.index.*;
import java.io.*;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Scanner;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Hit;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
/**
* This terminal application creates an Apache Lucene index in a folder and adds files into this index
* based on the input of the user.
*/
public class TextFileIndexer {
//declare all the fields
public static final String FIELD_PATH = "path";
public static final String FIELD_CONTENTS = "contents";
public static final String FIELD_ID = "docno";
public static final String FIELD_DATE = "date";
private IndexWriter writer;
private ArrayList<File> queue = new ArrayList<File>();
@SuppressWarnings("static-access")
public static void main(String[] args) throws IOException, ParseException {
String s = null;
Scanner sc = new Scanner(System.in);
//Menu selections
int choice;
do {
System.out.println("Welcome Search Engine. Please choose" + " your selections below\n" + "(1) Build index collection\n" + "(2) Search for the documents ids\n" + "(3) Exit\n");
System.out.print("Enter your choice: ");
choice = sc.nextInt();
//switch statements
switch (choice) {
case 1:
System.out.println("Enter the path where the index will be created: ");
BufferedReader br = new BufferedReader(
new InputStreamReader(System.in));
s = br.readLine();
TextFileIndexer indexer = null;
try {
indexer = new TextFileIndexer(s);
} catch (Exception ex) {
System.out.println("Cannot create index..." + ex.getMessage());
System.exit(-1);
}
try {
System.out.println("Enter the file or folder name to add into the index (q=quit):");
System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
s = br.readLine();
//try to add file into the index
indexer.indexFileOrDirectory(s);
} catch (Exception e) {
System.out.println("Error indexing " + s + " : " + e.getMessage());
}
indexer.closeIndex();
break;
case 2:
System.out.println("your query?");
br = new BufferedReader(
new InputStreamReader(System.in));
String a = br.readLine();
searchIndex(a);
break;
case 3:
//exit the program
System.out.println("Program exiting..");
break;
default:
//display invalid selection
System.err.println("Err: Invalid selection");
}
} while (choice != 3);
}
public TextFileIndexer(String index) throws IOException, ParseException {
// the boolean true parameter means to create a new index everytime,
// potentially overwriting any existing files there.
writer = new IndexWriter(index, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
}
public static void searchIndex(String words) throws IOException, ParseException{
System.out.println("Searching for '" + words + "'" );
[B]Directory directory = FSDirectory.getDirectory(INDEX_COLLECTION);[/B]
IndexReader indexReader = IndexReader.open(directory);
IndexSearcher indexSearcher = new IndexSearcher(indexReader);
Analyzer analyzer = new StandardAnalyzer();
QueryParser queryParser = new QueryParser(FIELD_ID, analyzer);
Query query = queryParser.parse(words);
Hits hits = indexSearcher.search(query);
System.out.println("Number of hits: " + hits.length());
Iterator<Hit> it = hits.iterator();
while (it.hasNext()) {
Hit hit = it.next();
Document document = hit.getDocument();
String text = document.get(FIELD_ID);
System.out.println("Hit: " + text);
}
}
public void indexFileOrDirectory(String fileName) throws IOException {
listFiles(new File(fileName));
int originalNumDocs = writer.numDocs();
for (File file : queue) {
FileReader fr = null;
try {
Document doc = new Document();
fr = new FileReader(file);
doc.add(new Field(FIELD_CONTENTS, fr));
String path = file.getCanonicalPath();
doc.add(new Field(FIELD_PATH, path, Field.Store.YES, Field.Index.UN_TOKENIZED));
String docno = file.getName();
doc.add(new Field(FIELD_ID, docno, Field.Store.YES, Field.Index.UN_TOKENIZED));
String date = file.getPath();
doc.add(new Field(FIELD_DATE, date, Field.Store.YES, Field.Index.UN_TOKENIZED));
writer.addDocument(doc);
System.out.println("Added: " + file);
} catch (Exception e) {
System.out.println("Could not add: " + file);
} finally {
fr.close();
}
}
int newNumDocs = writer.numDocs();
System.out.println("");
System.out.println("************************");
System.out.println((newNumDocs - originalNumDocs) + " documents added.");
System.out.println("************************");
queue.clear();
}
private void listFiles(File file) {
if (!file.exists()) {
System.out.println(file + " does not exist.");
}
if (file.isDirectory()) {
for (File f : file.listFiles()) {
listFiles(f);
}
} else {
String filename = file.getName().toLowerCase();
if (filename.endsWith(".htm") || filename.endsWith(".html") ||
filename.endsWith(".xml") || filename.endsWith(".txt")) {
queue.add(file);
} else {
System.out.println("Skipped " + filename);
}
}
}
public void closeIndex() throws IOException {
writer.optimize();
writer.close();
}
}
hi guys,
the problem is in bold. what i would like to have is to get the directory of my files.thanks. but i unable to do so.
-
Sorry You are very difficult understand what you want.
Do you want to index a special folder?
-
hi,
yes.its a special folder which consist of "tdt3/19982001/(and the text files). how should i be able to get the directory of this ?thanks.i am able to build the index. but when i try to do a search index, i dont know how to get the directory. i try manually but its said "access is denied" thanks.
-
see
Code:
package com.action;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
public class LuceneHtml {
public static final String FILES_TO_INDEX_DIRECTORY = "filesToIndex";
public static final String INDEX_DIRECTORY = "indexDirectory";
public static final String FIELD_PATH = "path";
public static final String FIELD_CONTENTS = "contents";
public static void main(String[] args) throws Exception {
createIndex();
searchIndex("msg");
}
public static void createIndex() throws IOException {
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
File index = new File(INDEX_DIRECTORY);
IndexWriter indexWriter = new IndexWriter(FSDirectory.open(index), analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
File dir = new File(FILES_TO_INDEX_DIRECTORY);
File[] files = dir.listFiles();
for (File file : files) {
Document document = new Document();
String path = file.getCanonicalPath();
document.add(new Field(FIELD_PATH, path, Field.Store.YES, Field.Index.ANALYZED));
Reader reader = new FileReader(file);
document.add(new Field(FIELD_CONTENTS, reader));
indexWriter.addDocument(document);
}
indexWriter.optimize();
indexWriter.close();
}
public static void searchIndex(String searchString) throws Exception {
System.out.println("Searching for '" + searchString + "'");
TopDocsCollector collector = TopFieldCollector.create(new Sort(new SortField(null, SortField.DOC, true)), 100, false, false, false, false);
Directory directory = FSDirectory.open(new File(INDEX_DIRECTORY));
IndexReader indexReader = IndexReader.open(directory);
IndexSearcher indexSearcher = new IndexSearcher(indexReader);
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
QueryParser queryParser = new QueryParser(Version.LUCENE_30, FIELD_CONTENTS, analyzer);
Query query = queryParser.parse(searchString);
indexSearcher.search(query, collector);
ScoreDoc[] hits = collector.topDocs().scoreDocs;
System.out.println("Number of hits: " + hits.length);
for (ScoreDoc hit : hits) {
Document document = indexSearcher.doc(hit.doc);
String path = document.get(FIELD_PATH);
System.out.println("Hit: " + path);
}
}
}
-
hi,
may i know what lucene 3.0 is?thanks.
-
I use last version Lucune 3.0.3.
see Version (Lucene 3.0.1 API)
What version do you use?
-
hi,
tried your codes but i still have access denied. This tdt3 has many sub directories inside
tdt3/1998/1.txt, tdt3/1999/1.txt (contain many date files)
-
hi,
the error : Exception in thread "main" java.io.FileNotFoundException: tdt3\19981001 (Access is denied)
-
Can you show a full structure your project?
This is structure my project
Code:
.
|-- filesToIndex
| `-- overview.html
|-- indexDirectory
| |-- _0.cfs
| |-- _0.cfx
| |-- segments_2
| `-- segments.gen
|-- lib
| |-- lucene-core-3.0.3.jar
| `-- lucene-highlighter-3.0.3.jar
|-- LuceneHtml.iml
|-- out
| |-- production
| | `-- LuceneHtml
| | `-- com
| | `-- action
| | `-- LuceneHtml.class
| `-- test
| `-- LuceneHtml
`-- src
`-- com
`-- action
`-- LuceneHtml.java
-
hi,
hmm..is you dont mind, i have inserted a link of pictures to show the structures of the document collection. thanks.[IMG]1.jpeg[/IMG] [IMG]2.jpeg[/IMG]
-
-
Ok. I looked your pictures. So you just can use recursive for reading all structure index folders. It's easy. if you want I would to write this code for you.
-
hi,
how should i go about doin the recursive?and able to add in the files as in the code
Directory directory = FSDirectory.getDirectory(INDEX_COLLECTION); or something that makes me able to do a search index. i know the building part is right, just cant connect it to the search index without passing parameter. cant think of any other way. thanks
-
Yeah I looked your code. You used a recursive. so you have a collection of files for index.
then you index each file. That is all.
for example
Code:
for (File file : files) {
createIndex(file);
}
Code:
public static void createIndex(File index) throws IOException {
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
IndexWriter indexWriter = new IndexWriter(FSDirectory.open(index), analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
File dir = new File(FILES_TO_INDEX_DIRECTORY);
File[] files = dir.listFiles();
for (File file : files) {
Document document = new Document();
String path = file.getCanonicalPath();
document.add(new Field(FIELD_PATH, path, Field.Store.YES, Field.Index.ANALYZED));
Reader reader = new FileReader(file);
document.add(new Field(FIELD_CONTENTS, reader));
indexWriter.addDocument(document);
}
indexWriter.optimize();
indexWriter.close();
}
-
hi,
Code:
for (File file : files) {
createIndex(file);
}
i only have this for loop at the createIndex() method only. Do you mean i supposed to put it inside?
-
hi,
what argument should i put at the createIndex(..) at the main method?thanks.
-
hi,
Code:
public static void createIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_24);
IndexWriter indexWriter = new IndexWriter(INDEX_COLLECTION, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
File dir = new File(FILES_TO_INDEX_DIRECTORY);
FilenameFilter select = new FileListFilter("txt");
File[] folder = dir.listFiles();
//File[] files = dirsub;
for (int i = 0; i< folder.length;i++) {
folder[i].getName();
File[] sub = folder[i].listFiles();
for(File subfiles: sub) {
Document document = new Document();
String path = subfiles.getCanonicalPath();
document.add(new Field(FIELD_PATH, path, Field.Store.YES, Field.Index.UN_TOKENIZED));
String docno = subfiles.getName();
document.add(new Field(FIELD_ID, docno, Field.Store.YES, Field.Index.UN_TOKENIZED));
String date = subfiles.getPath();
document.add(new Field(FIELD_DATE, date, Field.Store.YES, Field.Index.UN_TOKENIZED));
Reader reader = new FileReader(subfiles);
document.add(new Field(FIELD_CONTENTS, reader));
indexWriter.addDocument(document);
}
}
indexWriter.optimize();
indexWriter.close();
}
the lucene running and not doin anything
and my main method
Code:
public static void main(String[] args) throws Exception {
createIndex();
System.out.println("search");
BufferedReader br = new BufferedReader(
new InputStreamReader(System.in));
String s = br.readLine();
searchIndex(s);
}
-
I cant get the query. this is my codes. somehow it returned empty hits.
Code:
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Iterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hit;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
public class SearchEngine {
//declare all the fields
public static final String FILES_TO_INDEX_DIRECTORY = "tdt3/";
public static final String INDEX_COLLECTION = "indexDirectory";
public static final String FIELD_PATH = "path";
public static final String FIELD_CONTENTS = "contents";
public static final String FIELD_ID = "docno";
public static final String FIELD_DATE = "date";
public static void main(String[] args) throws Exception {
//createIndex();
System.out.println("search");
BufferedReader br = new BufferedReader(
new InputStreamReader(System.in));
String s = br.readLine();
searchIndex(s);
}
public static void createIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_24);
IndexWriter indexWriter = new IndexWriter(INDEX_COLLECTION, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
File dir = new File(FILES_TO_INDEX_DIRECTORY);
FilenameFilter select = new FileListFilter("txt");
File[] folder = dir.listFiles();
for (int i = 0; i < folder.length; i++) {
File[] sub = folder[i].listFiles(select);
for (File subfiles : sub) {
Document document = new Document();
String path = subfiles.getPath();
document.add(new Field(FIELD_PATH, path, Field.Store.YES, Field.Index.UN_TOKENIZED));
String docno = subfiles.getName();
document.add(new Field(FIELD_ID, docno, Field.Store.YES, Field.Index.UN_TOKENIZED));
String date = subfiles.getPath();
document.add(new Field(FIELD_DATE, date, Field.Store.YES, Field.Index.UN_TOKENIZED));
BufferedReader reader = new BufferedReader(new FileReader(subfiles));
document.add(new Field(FIELD_CONTENTS, reader));
System.out.println("added" + subfiles);
indexWriter.addDocument(document);
}
}
indexWriter.optimize();
indexWriter.close();
}
public static void searchIndex(String words) throws IOException, ParseException {
System.out.println("Searching for '" + words + "'");
FSDirectory.open(new File("tdt3/"));
//IndexReader indexReader = IndexReader.open(directory);
IndexSearcher indexSearcher = new IndexSearcher(INDEX_COLLECTION);
Analyzer analyzer = new StandardAnalyzer();
QueryParser queryParser = new QueryParser(FIELD_PATH, analyzer);
Query query = queryParser.parse(words);
Hits hits = indexSearcher.search(query);
System.out.println("Number of hits: " + hits.length());
Iterator<Hit> it = hits.iterator();
while (it.hasNext()) {
Hit hit = it.next();
Document document = hit.getDocument();
String text = document.get(FIELD_PATH);
System.out.println("Hit: " + text);
}
}
public static class FileListFilter implements FilenameFilter {
private String extension;
public FileListFilter(String extension) {
this.extension = extension;
}
public boolean accept(File directory, String filename) {
boolean fileOK = true;
if (extension != null) {
fileOK &= filename.endsWith('.' + extension);
}
return fileOK;
}
}
}
thanks for the feedback, i am still trying to spot where i did wrong.
-
I advise you to figure out how to make index for one file. Then you understand how you can index all files in index folder.
-
hi petr,
i have tested with one file only and it worked. and the problem yesterday regarding reading subfolder also has been solved. i am using nested loop to read the folder and its subfolder.
but now i am trying to search the index but the number of hits returned 0.