Results 1 to 10 of 10
Thread: Lucene indexing problem
- 12-04-2012, 04:19 PM #1
Member
- Join Date
- Feb 2010
- Posts
- 11
- Rep Power
- 0
Lucene indexing problem
Hi all, I'm creating a Lucene search engine for a whole bunch of newspaper excerpts. My problem is sorting these articles into an index before I perform some search functions on them. Below is my code so far:
As you may see, there are 9 desired fields I wish to save but I cannot figure out how to read the information held within the documents, for example below is a small section from one of the documents.Java Code:package org.apache.lucene; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopScoreDocCollector; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import java.io.*; import java.util.ArrayList; /** * This terminal application creates an Apache Lucene index in a folder and adds files into this index * based on the input of the user. */ public class TextFileIndexer { private static StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_40); private IndexWriter writer; private ArrayList<File> queue = new ArrayList<File>(); public static void main(String[] args) throws IOException { System.out.println("Enter the path where the index will be created"); String indexLocation = null; BufferedReader br = new BufferedReader( new InputStreamReader(System.in)); String s = br.readLine(); TextFileIndexer indexer = null; try { indexLocation = s; indexer = new TextFileIndexer(s); } catch (Exception ex) { System.out.println("Cannot create index..." 
+ ex.getMessage()); System.exit(-1); } //=================================================== //read input from user until he enters q for quit //=================================================== while (!s.equalsIgnoreCase("q")) { try { System.out.println("Enter the full path to add into the index"); System.out.println("[Acceptable file types: .xml, .html, .html, .txt]"); s = br.readLine(); if (s.equalsIgnoreCase("q")) { break; } //try to add file into the index indexer.indexFileOrDirectory(s); } catch (Exception e) { System.out.println("Error indexing " + s + " : " + e.getMessage()); } } //=================================================== //after adding, we always have to call the //closeIndex, otherwise the index is not created //=================================================== indexer.closeIndex(); //========================================================= // Now search //========================================================= IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation))); IndexSearcher searcher = new IndexSearcher(reader); TopScoreDocCollector collector = TopScoreDocCollector.create(5, true); s = ""; while (!s.equalsIgnoreCase("q")) { try { System.out.println("Enter the search query (q=quit):"); s = br.readLine(); if (s.equalsIgnoreCase("q")) { break; } Query q = new QueryParser(Version.LUCENE_40, "contents", analyzer).parse(s); searcher.search(q, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; // 4. display results System.out.println("Found " + hits.length + " hits."); for(int i=0;i<hits.length;++i) { int docId = hits[i].doc; Document d = searcher.doc(docId); System.out.println((i + 1) + ". " + d.get("path") + " score=" + hits[i].score); } } catch (Exception e) { System.out.println("Error searching " + s + " : " + e.getMessage()); } } } /** * Constructor * @param indexDir the name of the folder in which the index should be created * @throws java.io.IOException when exception creating index. 
*/ TextFileIndexer(String indexDir) throws IOException { // the boolean true parameter means to create a new index everytime, // potentially overwriting any existing files there. FSDirectory dir = FSDirectory.open(new File(indexDir)); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer); writer = new IndexWriter(dir, config); } /** * Indexes a file or directory * @param fileName the name of a text file or a folder we wish to add to the index * @throws java.io.IOException when exception */ public void indexFileOrDirectory(String fileName) throws IOException { //=================================================== //gets the list of files in a folder (if user has submitted //the name of a folder) or gets a single file name (is user //has submitted only the file name) //=================================================== addFiles(new File(fileName)); int originalNumDocs = writer.numDocs(); for (File f : queue) { FileReader fr = null; try { Document doc = new Document(); //=================================================== // add contents of file //=================================================== fr = new FileReader(f); doc.add(new StringField("DOCNO", f.getPath(), Field.Store.YES)); doc.add(new StringField("DOCID", f.getName(), Field.Store.YES)); doc.add(new StringField("HEADLINE", f.getName(), Field.Store.YES)); doc.add(new StringField("DATE", f.getName(), Field.Store.YES)); doc.add(new StringField("SOURCE", f.getName(), Field.Store.YES)); doc.add(new StringField("COMPANY", f.getName(), Field.Store.YES)); doc.add(new StringField("INDUSTRY", f.getName(), Field.Store.YES)); doc.add(new StringField("INTRODUCTION", f.getName(), Field.Store.YES)); doc.add(new StringField("ARTICLE", f.getName(), Field.Store.YES)); writer.addDocument(doc); System.out.println("Added: " + f); } catch (Exception e) { System.out.println("Could not add: " + f); } finally { fr.close(); } } int newNumDocs = writer.numDocs(); System.out.println(""); 
System.out.println("************************"); System.out.println((newNumDocs - originalNumDocs) + " documents added."); System.out.println("************************"); queue.clear(); } private void addFiles(File file) { if (!file.exists()) { System.out.println(file + " does not exist."); } if (file.isDirectory()) { for (File f : file.listFiles()) { addFiles(f); } } else { String filename = file.getName().toLowerCase(); //=================================================== // Only index text files //=================================================== if (filename.endsWith("")) { queue.add(file); } else { System.out.println("Skipped " + filename); } } } /** * Close the index. * @throws java.io.IOException when exception closing */ public void closeIndex() throws IOException { writer.close(); } }
Ordinarily I would just use a standard xml reader but there are no root nodes in the documents. Instead I have readied a file reader but I'm not sure how I would go about extracting the information between tags.XML Code:<DOC> <DOCNO> WSJ900402-0193 </DOCNO> <DOCID> 900402-0193. </DOCID> <HL> Parent of the Department Store Units Of Campeau Files Under Chapter 11 ---- By Robert Melnbardis Staff Reporter of The Wall Street Journal </HL> <DATE> 04/02/90 </DATE> <SO> WALL STREET JOURNAL (J), PAGE B7 </SO> <CO> CMAFC </CO> <IN> RETAILING (RET) BANKRUPTCIES (BCY) BOND MARKET NEWS (BON) </IN> <LP> TORONTO -- The U.S. parent of Campeau Corp.'s department store units filed for Chapter 11 bankruptcy-law protection in what it said was a "procedural step" in the continuing Chapter 11 reorganization of the retail units. Federated Stores Inc., formerly Campeau Corp. (U.S), said it filed for Chapter 11 protection in U.S. Bankruptcy Court in San Francisco. That is the same court where three Federated holding companies and a U.S. Campeau real estate unit filed for Chapter 11 protection on Jan. 15. </LP> <TEXT> Federated is the holding company for Federated Department Stores Inc. and Allied Stores Corp., which filed for Chapter 11 protection in Cincinnati on Jan. 15. It also holds two U.S. real estate units and a joint-venture development partnership with Edward J. DeBartolo Corp., a Youngstown, Ohio, shopping mall developer that lent Campeau $480 million. Federated said its filing is "meaningless" to Federated Department Stores and Allied Stores, and won't affect their ability to meet their continuing obligations in the normal course of business. It said the filing doesn't affect Ralphs Grocery Co., Campeau's Southern California supermarket chain, which isn't under Chapter 11 protection. 
Federated and Allied are receiving merchandise shipments from 98% of their "key" vendors and their department store inventories are on a par with year-earlier levels, said Allen Questrom, chairman and chief executive officer, and James Zimmerman, president, in a statement. A spokeswoman for Federated said she couldn't comment on whether the bankruptcy filing would effectively freeze the Campeau/DeBartolo mall development partnership. The spokeswoman also said Federated's filing wasn't related to the Canadian parent's latest cash woes. Campeau Corp. didn't pay about $18.9 million of interest due last Saturday on two debenture issues, and for the second straight month likely will skip a $5.2 million monthly interest payment due today to two major creditors. Campeau said the debentures will be in default if the payments aren't made by April 30, and it wants to defer paying the interest until it has a new business plan for its Canadian and U.S. real estate operations. Campeau is already in default on $705 million of loans from Olympia & York Developments Ltd. and DeBartolo because it deliberately declined to pay February's $5.2 million interest payment. A Campeau spokesman said it wouldn't be in the company's best interest to pay the interest now. He said Campeau is "pretty confident" the debenture holders will agree to defer the payments. Approval of 66% of the debenture holders is needed for such a deferral. He said the company is up to date on paying interest due on other loans. Olympia & York declined to comment, but a source close to the real estate development concern closely held by Toronto's Reichmann family said it probably would be amenable to deferring payments. DeBartolo officials weren't available for comment. Analysts weren't surprised by Campeau's decision to skip the interest payments on its debentures. 
"It wasn't totally unexpected given the negotiations they must get through with their creditors on debt owed by the parent company," said Ross Cowan, of Toronto-based Levesque Beaubien Geoffrion Inc. Analysts expect Campeau's business plan will include renegotiated loan agreements and an accelerated program of asset sales, including some of Campeau's prime properties that are encumbered under loans. Campeau's debt crisis became acute when Federated and Allied sought Chapter 11 bankruptcy-law protection. The Canadian parent company needed the cash flow from Federated and Allied to service debt taken on during its $6.6 billion takeover of Federated in 1988. The interest payments Campeau missed Saturday include about $13 million due on its $260 million of 7% convertible debentures, all of which are held by Olympia & York, and 6.9 million Canadian dollars (US$5.8 million) due on C$184 million of 7.5% convertible debentures, of which C$46.6 million is held by Olympia & York. The interest rate on the 7% debentures was increased to 9.86% last September. </TEXT> </DOC> <DOC> <DOCNO> WSJ900402-0192 </DOCNO> <DOCID> 900402-0192. </DOCID> <HL> VLSI to Post Profit Matching Forecasts For the First Quarter </HL> <DATE> 04/02/90 </DATE> <SO> WALL STREET JOURNAL (J), PAGE A8B </SO> <CO> VLSI </CO> <IN> DOW JONES INTERVIEW (CEO) </IN> <LP> NEW YORK -- VLSI Technology Inc.'s first-quarter earnings should meet analysts' expectations, the company's chairman and chief executive officer, Alfred J. Stein, said. "We expect to do as well as the analysts are projecting . . . between five and eight cents a share," Mr. Stein added. VLSI makes standard and customized integrated circuits. </LP> <TEXT> Mr. Stein noted that late last year, the company guided analysts' first-quarter projections lower from earlier estimates of around 15 cents a share. 
He said a slowdown in standard chip-set sales and a drop in demand for custom chips by its largest customer, Apple Computer Corp., stalled revenue growth in the quarter. The company had a loss of $6.3 million in the first quarter of 1989, largely due to problems during the start-up of its chip plant in San Antonio, Texas. VLSI earned 11 cents a share in the latest fourth quarter. The company expects to release its first-quarter earnings April 12. Mr. Stein said seasonal slowdown in Far Eastern demand for the chip sets was partly responsible for damping growth in the first quarter. The region's IBM-compatible computer makers use the sets in personal computers that see their strongest sales before the Christmas holidays. Far Eastern demand generally slacks off in the first quarter, Mr. Stein said. Shipments to the Far East account for about half of the company's chip-set sales. Mr. Stein said demand for customized chips by Apple Computer has recovered from a drop that also depressed first-quarter revenue. "Apple is coming back very strongly to us," Mr. Stein asserted. He added, however, that the impact of Apple's resumed demand won't be felt until the second and third quarters of this year. Apple accounted for 13% of VLSI's revenue in 1989, while sales to International Business Machines Corp. rose to about 10% of total revenue. Mr. Stein said sales to IBM will exceed sales to Apple this year because of increasing shipments to IBM, not because of shrinking sales to Apple. The increasing importance of IBM as a customer illustrates VLSI's strategic shift toward sales of "application-specific standard product," primarily standardized chip sets for personal computers, over the customized chips designed for Apple Computer and others. </TEXT> </DOC>
Any help would be greatly appreciated!
Thanks
- 12-04-2012, 05:00 PM #2
- Join Date
- Sep 2008
- Location
- Voorschoten, the Netherlands
- Posts
- 11,405
- Blog Entries
- 7
- Rep Power
- 17
Re: Lucene indexing problem
You can always simulate/fake a root element <root> ... </root> and let the xml stuff do the rest. Faking the root element can be done by an InputStream or Reader that wraps the original InputStream/Reader.
kind regards,
Jos
When people rob a bank they get a penalty; when banks rob people they get a bonus.
- 12-04-2012, 05:14 PM #3
Member
- Join Date
- Feb 2010
- Posts
- 11
- Rep Power
- 0
Re: Lucene indexing problem
Sounds good. Could I also use an HTML parser? I tried using JSoup but I could only figure out how to parse strings.
- 12-04-2012, 05:26 PM #4
- Join Date
- Sep 2008
- Location
- Voorschoten, the Netherlands
- Posts
- 11,405
- Blog Entries
- 7
- Rep Power
- 17
- 12-04-2012, 06:38 PM #5
Member
- Join Date
- Feb 2010
- Posts
- 11
- Rep Power
- 0
Re: Lucene indexing problem
Okay, I've used jsoup to extract the information from the angled brackets; however, I have another question. How do I get it to parse a FileReader instead of a String? My code for parsing is as below.
EDIT: I altered the code as shown below. It now reads one of my document files; the problem is that there are tons of DOCs within one file, and when I print (with the last statement) it only gives me the first DOC.
Java Code:import java.io.File; import java.io.FileReader; import java.io.IOException; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; public class TestClass2 { public static void main(String args[]) throws IOException { FileReader fr = null; File input = new File("WSJ_0402"); Document doc = Jsoup.parse(input, "UTF-8", "http://example.com/"); //extracts the data from the <>'s Element DOC = doc.select("DOC").first(); Element DOCNO = doc.select("DOCNO").first(); Element DOCID = doc.select("DOCID").first(); Element HEADLINE = doc.select("HL").first(); Element DATE = doc.select("DATE").first(); Element SOURCE = doc.select("SO").first(); Element COMPANY = doc.select("CO").first(); Element INDUSTRY = doc.select("IN").first(); Element INTRODUCTION = doc.select("LP").first(); Element ARTICLE = doc.select("TEXT").first(); //just changes the data inside the <>'s to a string String linkText = DOC.text(); String linkText2 = DOCNO.text(); String linkText3 = DOCID.text(); String linkText4 = HEADLINE.text(); String linkText5 = DATE.text(); String linkText6 = SOURCE.text(); String linkText7 = COMPANY.text(); String linkText8 = INDUSTRY.text(); String linkText9 = INTRODUCTION.text(); String linkText10 = ARTICLE.text(); System.out.println(linkText); } }Last edited by Blacky777; 12-04-2012 at 06:49 PM.
- 12-04-2012, 06:45 PM #6
- Join Date
- Sep 2008
- Location
- Voorschoten, the Netherlands
- Posts
- 11,405
- Blog Entries
- 7
- Rep Power
- 17
- 12-04-2012, 07:12 PM #7
Member
- Join Date
- Feb 2010
- Posts
- 11
- Rep Power
- 0
Re: Lucene indexing problem
haha not the biggest fan of soup either

Can finally read all instances of each <>
Now I just need to get the 2 classes to communicate. WSJ_0402 is the name of one of the HTML files.
EDIT A problem I face is that when I try to add a string to a field, it adds all instances as 1 entry for example i try to add dates asJava Code:import java.io.File; import java.io.FileReader; import java.io.IOException; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class TestClass2 { public static void main(String args[]) throws IOException { FileReader fr = null; File input = new File("WSJ_0402"); Document doc = Jsoup.parse(input, "UTF-8", "http://example.com/"); //extracts the data from the <>'s Elements DOC = doc.select("DOC"); Elements DOCNO = doc.select("DOCNO"); Elements DOCID = doc.select("DOCID"); Elements HEADLINE = doc.select("HL"); Elements DATE = doc.select("DATE"); Elements SOURCE = doc.select("SO"); Elements COMPANY = doc.select("CO"); Elements INDUSTRY = doc.select("IN"); Elements INTRODUCTION = doc.select("LP"); Elements ARTICLE = doc.select("TEXT"); //just changes the data inside the <>'s to a string String linkText = DOC.text(); String linkText2 = DOCNO.text(); String linkText3 = DOCID.text(); String linkText4 = HEADLINE.text(); String linkText5 = DATE.text(); String linkText6 = SOURCE.text(); String linkText7 = COMPANY.text(); String linkText8 = INDUSTRY.text(); String linkText9 = INTRODUCTION.text(); String linkText10 = ARTICLE.text(); } }
Luke (a .jar that displays my index) shows that DATE has a single entry: 04/02/90 04/02/90 05/02/90......... Is there a way to separate the terms? I know .first() gets the first term, but is there a way to 'pop' this term instead of 'peeking'?
Java Code:ind.add(new StringField("DATE", linkText5, null));
Last edited by Blacky777; 12-04-2012 at 07:48 PM.
- 12-04-2012, 08:49 PM #8
- Join Date
- Sep 2008
- Location
- Voorschoten, the Netherlands
- Posts
- 11,405
- Blog Entries
- 7
- Rep Power
- 17
Re: Lucene indexing problem
Sorry, I'm bailing out here: I don't know jsoup and I don't know Luke (is that also some kind of soup?)
kind regards,
Jos
When people rob a bank they get a penalty; when banks rob people they get a bonus.
- 12-04-2012, 09:02 PM #9
Member
- Join Date
- Feb 2010
- Posts
- 11
- Rep Power
- 0
Re: Lucene indexing problem
Thanks for your assistance so far JosAH
I've managed to index the terms individually, the code is as shown below
Problem is though, that this adds nearly 20,000 documents (same number of total field entries) when there should be 194. Some have missing fields which adds to the problem further too.Java Code:package org.apache.lucene; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopScoreDocCollector; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.jsoup.Jsoup; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.jsoup.Jsoup; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.*; import java.nio.CharBuffer; import java.util.ArrayList; /** * This terminal application creates an Apache Lucene index in a folder and adds files into this index * based on the input of the user. */ public class TextFileIndexer { private static StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_40); private IndexWriter writer; private ArrayList<File> queue = new ArrayList<File>(); public static void main(String[] args) throws IOException { System.out.println("Enter the path where the index will be created"); String indexLocation = null; BufferedReader br = new BufferedReader( new InputStreamReader(System.in)); String s = br.readLine(); TextFileIndexer indexer = null; try { indexLocation = s; indexer = new TextFileIndexer(s); } catch (Exception ex) { System.out.println("Cannot create index..." 
+ ex.getMessage()); System.exit(-1); } //=================================================== //read input from user until he enters q for quit //=================================================== while (!s.equalsIgnoreCase("q")) { try { System.out.println("Enter the full path to add into the index"); System.out.println("[Acceptable file types: .xml, .html, .html, .txt]"); s = br.readLine(); if (s.equalsIgnoreCase("q")) { break; } //try to add file into the index indexer.indexFileOrDirectory(s); } catch (Exception e) { System.out.println("Error indexing " + s + " : " + e.getMessage()); } } //=================================================== //call the closeIndex, otherwise the index is not created //=================================================== indexer.closeIndex(); //========================================================= // Now search //========================================================= IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation))); IndexSearcher searcher = new IndexSearcher(reader); TopScoreDocCollector collector = TopScoreDocCollector.create(5, true); s = ""; while (!s.equalsIgnoreCase("q")) { try { System.out.println("Enter the search query (q=quit):"); s = br.readLine(); if (s.equalsIgnoreCase("q")) { break; } Query q = new QueryParser(Version.LUCENE_40, "contents", analyzer).parse(s); searcher.search(q, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; // 4. display results System.out.println("Found " + hits.length + " hits."); for(int i=0;i<hits.length;++i) { int docId = hits[i].doc; Document d = searcher.doc(docId); System.out.println((i + 1) + ". " + d.get("path") + " score=" + hits[i].score); } } catch (Exception e) { System.out.println("Error searching " + s + " : " + e.getMessage()); } } } /** * Constructor * @param indexDir the name of the folder in which the index should be created * @throws java.io.IOException when exception creating index. 
*/ TextFileIndexer(String indexDir) throws IOException { // the boolean true parameter means to create a new index everytime, // potentially overwriting any existing files there. FSDirectory dir = FSDirectory.open(new File(indexDir)); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer); writer = new IndexWriter(dir, config); } /** * Indexes a file or directory * @param fileName the name of a text file or a folder we wish to add to the index * @throws java.io.IOException when exception */ public void indexFileOrDirectory(String fileName) throws IOException { //=================================================== //gets the list of files in a folder (if user has submitted //the name of a folder) or gets a single file name (is user //has submitted only the file name) //=================================================== addFiles(new File(fileName)); int originalNumDocs = writer.numDocs(); for (File f : queue) { FileReader fr = null; FileReader frDate = null; try { Document ind = new Document(); //=================================================== // add contents of file //=================================================== fr = new FileReader(f); File input = new File("WSJ_0402"); //only 1 file right now org.jsoup.nodes.Document doc = Jsoup.parse(input, "UTF-8", "http://example.com/"); //extracts the data from the <>'s Elements DOC = doc.select("DOC"); Elements DOCNO = doc.select("DOCNO"); Elements DOCID = doc.select("DOCID"); Elements HEADLINE = doc.select("HL"); Elements DATE = doc.select("DATE"); Elements SOURCE = doc.select("SO"); Elements COMPANY = doc.select("CO"); Elements INDUSTRY = doc.select("IN"); Elements INTRODUCTION = doc.select("LP"); Elements ARTICLE = doc.select("TEXT"); //changes the data inside the <>'s to a string and adds the data to //relevant field for each instance of DOC for (int i=0; i<DOC.size();i++){ String linkText2 = DOCNO.get(i).toString(); ind.add(new StringField("DOCNO", linkText2, null)); String linkText3 = 
DOCID.get(i).toString(); ind.add(new StringField("DOCID", linkText3, null)); String linkText4 = HEADLINE.get(i).toString(); ind.add(new StringField("HEADLINE", linkText4, null)); String linkText5 = DATE.get(i).toString(); ind.add(new StringField("DATE", linkText5, null)); String linkText6 = SOURCE.get(i).toString(); ind.add(new StringField("SOURCE", linkText6, null)); String linkText7 = COMPANY.get(i).toString(); ind.add(new StringField("COMPANY", linkText7, null)); String linkText8 = INDUSTRY.get(i).toString(); ind.add(new StringField("INDUSTRY", linkText8, null)); String linkText9 = INTRODUCTION.get(i).toString(); ind.add(new StringField("INTRODUCTION", linkText9, null)); String linkText10 = ARTICLE.get(i).toString(); ind.add(new StringField("ARTICLE", linkText10, null)); writer.addDocument(ind); } System.out.println("Added: " + f); } catch (Exception e) { System.out.println("Could not add: " + f); } finally { fr.close(); } } int newNumDocs = writer.numDocs(); System.out.println(""); System.out.println("************************"); System.out.println(writer.numDocs() + " terms added."); System.out.println("************************"); queue.clear(); } private void addFiles(File file) { if (!file.exists()) { System.out.println(file + " does not exist."); } if (file.isDirectory()) { for (File f : file.listFiles()) { addFiles(f); } } else { String filename = file.getName().toLowerCase(); //=================================================== // Only index text files //=================================================== if (filename.endsWith("") || filename.endsWith(".html") || filename.endsWith(".xml") || filename.endsWith(".txt")) { queue.add(file); } else { System.out.println("Skipped " + filename); } } } /** * Close the index. * @throws java.io.IOException when exception closing */ public void closeIndex() throws IOException { writer.close(); } }
- 12-05-2012, 02:43 PM #10
Member
- Join Date
- Feb 2010
- Posts
- 11
- Rep Power
- 0
Re: Lucene indexing problem
Can somebody tell me what the null value points to in this command?
It says "Store stored" but I'm not sure what this means. Is this maybe why all my stored fields are not connected to one another as an individual document?
Java Code:ind.add(new StringField("DOCID", linkText3, null))
Similar Threads
-
Lucene indexing help
By JP10 in forum LuceneReplies: 1Last Post: 07-25-2011, 05:34 PM -
Indexing XML using lucene
By peliukasss in forum LuceneReplies: 0Last Post: 03-28-2010, 10:20 PM -
[SOLVED] Parallel/Asynchronous Faster indexing way on lucene
By priyanka.dandekar in forum LuceneReplies: 2Last Post: 10-06-2008, 08:20 PM -
Lucene indexing ans searching code needed
By vgarg80 in forum JavaServer Pages (JSP) and JSTLReplies: 0Last Post: 06-07-2008, 11:31 AM -
Lucene Re-Indexing
By connect2srinath in forum LuceneReplies: 1Last Post: 05-11-2008, 05:35 PM


LinkBack URL
About LinkBacks
Reply With Quote

Bookmarks