Results 1 to 10 of 10
  1. #1
    Blacky777 is offline Member
    Join Date
    Feb 2010
    Posts
    11
    Rep Power
    0

    Default Lucene indexing problem

    Hi all, I'm building a Lucene search engine for a large collection of newspaper excerpts. My problem is getting these articles into an index before I run any searches on them. Below is my code so far:

    Java Code:
    package org.apache.lucene;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TopScoreDocCollector;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    
    import java.io.*;
    import java.util.ArrayList;
    
    /**
     * This terminal application creates an Apache Lucene index in a folder and adds files into this index
     * based on the input of the user.
     */
    public class TextFileIndexer {
      private static StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
    
      private IndexWriter writer;
      private ArrayList<File> queue = new ArrayList<File>();
    
    
      public static void main(String[] args) throws IOException {
        System.out.println("Enter the path where the index will be created");
    
        String indexLocation = null;
        BufferedReader br = new BufferedReader(
                new InputStreamReader(System.in));
        String s = br.readLine();
    
        TextFileIndexer indexer = null;
        try {
          indexLocation = s;
          indexer = new TextFileIndexer(s);
        } catch (Exception ex) {
          System.out.println("Cannot create index..." + ex.getMessage());
          System.exit(-1);
        }
    
        //===================================================
        //read input from user until he enters q for quit
        //===================================================
        while (!s.equalsIgnoreCase("q")) {
          try {
            System.out.println("Enter the full path to add into the index");
            System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
              break;
            }
    
            //try to add file into the index
            indexer.indexFileOrDirectory(s);
          } catch (Exception e) {
            System.out.println("Error indexing " + s + " : " + e.getMessage());
          }
        }
    
        //===================================================
        //after adding, we always have to call the
        //closeIndex, otherwise the index is not created    
        //===================================================
        indexer.closeIndex();
    
        //=========================================================
        // Now search
        //=========================================================
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
        IndexSearcher searcher = new IndexSearcher(reader);
        TopScoreDocCollector collector = TopScoreDocCollector.create(5, true);
    
        s = "";
        while (!s.equalsIgnoreCase("q")) {
          try {
            System.out.println("Enter the search query (q=quit):");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
              break;
            }
            Query q = new QueryParser(Version.LUCENE_40, "contents", analyzer).parse(s);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
    
            // 4. display results
            System.out.println("Found " + hits.length + " hits.");
            for(int i=0;i<hits.length;++i) {
              int docId = hits[i].doc;
              Document d = searcher.doc(docId);
              System.out.println((i + 1) + ". " + d.get("path") + " score=" + hits[i].score);
            }
    
          } catch (Exception e) {
            System.out.println("Error searching " + s + " : " + e.getMessage());
          }
        }
    
      }
    
      /**
       * Constructor
       * @param indexDir the name of the folder in which the index should be created
       * @throws java.io.IOException when exception creating index.
       */
      TextFileIndexer(String indexDir) throws IOException {
        // the boolean true parameter means to create a new index everytime, 
        // potentially overwriting any existing files there.
        FSDirectory dir = FSDirectory.open(new File(indexDir));
    
    
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);
    
        writer = new IndexWriter(dir, config);
      }
    
      /**
       * Indexes a file or directory
       * @param fileName the name of a text file or a folder we wish to add to the index
       * @throws java.io.IOException when exception
       */
      public void indexFileOrDirectory(String fileName) throws IOException {
        //===================================================
        //gets the list of files in a folder (if user has submitted
        //the name of a folder) or gets a single file name (is user
        //has submitted only the file name) 
        //===================================================
        addFiles(new File(fileName));
        
        int originalNumDocs = writer.numDocs();
        for (File f : queue) {
          FileReader fr = null;
          try {
            Document doc = new Document();
    
            //===================================================
            // add contents of file
            //===================================================
            fr = new FileReader(f);
            doc.add(new StringField("DOCNO", f.getPath(), Field.Store.YES));
            doc.add(new StringField("DOCID", f.getName(), Field.Store.YES));
            doc.add(new StringField("HEADLINE", f.getName(), Field.Store.YES));
            doc.add(new StringField("DATE", f.getName(), Field.Store.YES));
            doc.add(new StringField("SOURCE", f.getName(), Field.Store.YES));
            doc.add(new StringField("COMPANY", f.getName(), Field.Store.YES));
            doc.add(new StringField("INDUSTRY", f.getName(), Field.Store.YES));
            doc.add(new StringField("INTRODUCTION", f.getName(), Field.Store.YES));
            doc.add(new StringField("ARTICLE", f.getName(), Field.Store.YES));
            
    
            writer.addDocument(doc);
            System.out.println("Added: " + f);
          } catch (Exception e) {
            System.out.println("Could not add: " + f);
          } finally {
            fr.close();
          }
        }
        
        int newNumDocs = writer.numDocs();
        System.out.println("");
        System.out.println("************************");
        System.out.println((newNumDocs - originalNumDocs) + " documents added.");
        System.out.println("************************");
    
        queue.clear();
      }
    
      private void addFiles(File file) {
    
        if (!file.exists()) {
          System.out.println(file + " does not exist.");
        }
        if (file.isDirectory()) {
          for (File f : file.listFiles()) {
            addFiles(f);
          }
        } else {
          String filename = file.getName().toLowerCase();
          //===================================================
          // Only index text files
          //===================================================
          if (filename.endsWith("")) {
            queue.add(file);
          } else {
            System.out.println("Skipped " + filename);
          }
        }
      }
    
      /**
       * Close the index.
       * @throws java.io.IOException when exception closing
       */
      public void closeIndex() throws IOException {
        writer.close();
      }
    }
    As you may see, there are 9 desired fields I wish to save but I cannot figure out how to read the information held within the documents, for example below is a small section from one of the documents.

    XML Code:
    <DOC>
    <DOCNO>
    WSJ900402-0193
    </DOCNO>
    <DOCID>
    900402-0193.
    </DOCID>
    <HL>
       Parent of the Department Store Units
       Of Campeau Files Under Chapter 11
       ----
       By Robert Melnbardis
       Staff Reporter of The Wall Street Journal
    </HL>
    <DATE>
    04/02/90
    </DATE>
    <SO>
    WALL STREET JOURNAL (J), PAGE B7
    </SO>
    <CO>
       CMAFC
    </CO>
    <IN>
    RETAILING (RET)
    BANKRUPTCIES (BCY)
    BOND MARKET NEWS (BON)
    </IN>
    <LP>
       TORONTO -- The U.S. parent of Campeau Corp.'s department
    store units filed for Chapter 11 bankruptcy-law protection in
    what it said was a "procedural step" in the continuing
    Chapter 11 reorganization of the retail units.
       Federated Stores Inc., formerly Campeau Corp. (U.S), said
    it filed for Chapter 11 protection in U.S. Bankruptcy Court
    in San Francisco. That is the same court where three
    Federated holding companies and a U.S. Campeau real estate
    unit filed for Chapter 11 protection on Jan. 15.
    </LP>
    <TEXT>
       Federated is the holding company for Federated Department
    Stores Inc. and Allied Stores Corp., which filed for Chapter
    11 protection in Cincinnati on Jan. 15. It also holds two
    U.S. real estate units and a joint-venture development
    partnership with Edward J. DeBartolo Corp., a Youngstown,
    Ohio, shopping mall developer that lent Campeau $480 million.
       Federated said its filing is "meaningless" to Federated
    Department Stores and Allied Stores, and won't affect their
    ability to meet their continuing obligations in the normal
    course of business. It said the filing doesn't affect Ralphs
    Grocery Co., Campeau's Southern California supermarket chain,
    which isn't under Chapter 11 protection.
       Federated and Allied are receiving merchandise shipments
    from 98% of their "key" vendors and their department store
    inventories are on a par with year-earlier levels, said Allen
    Questrom, chairman and chief executive officer, and James
    Zimmerman, president, in a statement.
       A spokeswoman for Federated said she couldn't comment on
    whether the bankruptcy filing would effectively freeze the
    Campeau/DeBartolo mall development partnership. The
    spokeswoman also said Federated's filing wasn't related to
    the Canadian parent's latest cash woes.
       Campeau Corp. didn't pay about $18.9 million of interest
    due last Saturday on two debenture issues, and for the second
    straight month likely will skip a $5.2 million monthly
    interest payment due today to two major creditors.
       Campeau said the debentures will be in default if the
    payments aren't made by April 30, and it wants to defer
    paying the interest until it has a new business plan for its
    Canadian and U.S. real estate operations.
       Campeau is already in default on $705 million of loans
    from Olympia &amp; York Developments Ltd. and DeBartolo because
    it deliberately declined to pay February's $5.2 million
    interest payment.
       A Campeau spokesman said it wouldn't be in the company's
    best interest to pay the interest now. He said Campeau is
    "pretty confident" the debenture holders will agree to defer
    the payments. Approval of 66% of the debenture holders is
    needed for such a deferral. He said the company is up to date
    on paying interest due on other loans.
       Olympia &amp; York declined to comment, but a source close to
    the real estate development concern closely held by Toronto's
    Reichmann family said it probably would be amenable to
    deferring payments. DeBartolo officials weren't available for
    comment.
       Analysts weren't surprised by Campeau's decision to skip
    the interest payments on its debentures.
       "It wasn't totally unexpected given the negotiations they
    must get through with their creditors on debt owed by the
    parent company," said Ross Cowan, of Toronto-based Levesque
    Beaubien Geoffrion Inc.
       Analysts expect Campeau's business plan will include
    renegotiated loan agreements and an accelerated program of
    asset sales, including some of Campeau's prime properties
    that are encumbered under loans.
       Campeau's debt crisis became acute when Federated and
    Allied sought Chapter 11 bankruptcy-law protection. The
    Canadian parent company needed the cash flow from Federated
    and Allied to service debt taken on during its $6.6 billion
    takeover of Federated in 1988.
       The interest payments Campeau missed Saturday include
    about $13 million due on its $260 million of 7% convertible
    debentures, all of which are held by Olympia &amp; York, and 6.9
    million Canadian dollars (US$5.8 million) due on C$184
    million of 7.5% convertible debentures, of which C$46.6
    million is held by Olympia &amp; York. The interest rate on the
    7% debentures was increased to 9.86% last September.
    </TEXT>
    </DOC>
    <DOC>
    <DOCNO>
    WSJ900402-0192
    </DOCNO>
    <DOCID>
    900402-0192.
    </DOCID>
    <HL>
       VLSI to Post Profit
       Matching Forecasts
       For the First Quarter
    </HL>
    <DATE>
    04/02/90
    </DATE>
    <SO>
    WALL STREET JOURNAL (J), PAGE A8B
    </SO>
    <CO>
       VLSI
    </CO>
    <IN>
    DOW JONES INTERVIEW (CEO)
    </IN>
    <LP>
       NEW YORK -- VLSI Technology Inc.'s first-quarter earnings
    should meet analysts' expectations, the company's chairman
    and chief executive officer, Alfred J. Stein, said.
       "We expect to do as well as the analysts are projecting .
    . . between five and eight cents a share," Mr. Stein added.
    VLSI makes standard and customized integrated circuits.
    </LP>
    <TEXT>
       Mr. Stein noted that late last year, the company guided
    analysts' first-quarter projections lower from earlier
    estimates of around 15 cents a share.
       He said a slowdown in standard chip-set sales and a drop
    in demand for custom chips by its largest customer, Apple
    Computer Corp., stalled revenue growth in the quarter.
       The company had a loss of $6.3 million in the first
    quarter of 1989, largely due to problems during the start-up
    of its chip plant in San Antonio, Texas. VLSI earned 11 cents
    a share in the latest fourth quarter.
       The company expects to release its first-quarter earnings
    April 12.
       Mr. Stein said seasonal slowdown in Far Eastern demand for
    the chip sets was partly responsible for damping growth in
    the first quarter.
       The region's IBM-compatible computer makers use the sets
    in personal computers that see their strongest sales before
    the Christmas holidays. Far Eastern demand generally slacks
    off in the first quarter, Mr. Stein said. Shipments to the
    Far East account for about half of the company's chip-set
    sales.
       Mr. Stein said demand for customized chips by Apple
    Computer has recovered from a drop that also depressed
    first-quarter revenue.
       "Apple is coming back very strongly to us," Mr. Stein
    asserted. He added, however, that the impact of Apple's
    resumed demand won't be felt until the second and third
    quarters of this year.
       Apple accounted for 13% of VLSI's revenue in 1989, while
    sales to International Business Machines Corp. rose to about
    10% of total revenue.
       Mr. Stein said sales to IBM will exceed sales to Apple
    this year because of increasing shipments to IBM, not because
    of shrinking sales to Apple. The increasing importance of IBM
    as a customer illustrates VLSI's strategic shift toward sales
    of "application-specific standard product," primarily
    standardized chip sets for personal computers, over the
    customized chips designed for Apple Computer and others.
    </TEXT>
    </DOC>
    Ordinarily I would just use a standard xml reader but there are no root nodes in the documents. Instead I have readied a file reader but I'm not sure how I would go about extracting the information between tags.

    Any help would be greatly appreciated!
    Thanks

  2. #2
    JosAH's Avatar
    JosAH is online now Moderator
    Join Date
    Sep 2008
    Location
    Voorschoten, the Netherlands
    Posts
    13,728
    Blog Entries
    7
    Rep Power
    21

    Default Re: Lucene indexing problem

    You can always simulate/fake a root element <root> ... </root> and let the xml stuff do the rest. Faking the root element can be done by an InputStream or Reader that wraps the original InputStream/Reader.

    kind regards,

    Jos
    cenosillicaphobia: the fear for an empty beer glass

  3. #3
    Blacky777 is offline Member
    Join Date
    Feb 2010
    Posts
    11
    Rep Power
    0

    Default Re: Lucene indexing problem

    Sounds good, could I use a HTML parser also? I tried using JSoup but I could only figure out how to parse strings.

  4. #4
    JosAH's Avatar
    JosAH is online now Moderator
    Join Date
    Sep 2008
    Location
    Voorschoten, the Netherlands
    Posts
    13,728
    Blog Entries
    7
    Rep Power
    21

    Default Re: Lucene indexing problem

    Quote Originally Posted by Blacky777 View Post
    Sounds good, could I use a HTML parser also? I tried using JSoup but I could only figure out how to parse strings.
    You can use a html parser as much as you can use, say, a C++ parser for parsing Java ...

    kind regards,

    Jos
    cenosillicaphobia: the fear for an empty beer glass

  5. #5
    Blacky777 is offline Member
    Join Date
    Feb 2010
    Posts
    11
    Rep Power
    0

    Default Re: Lucene indexing problem

    Okay, I've used jsoup to extract the information between the angle brackets; however, I have another question: how do I get it to parse a FileReader instead of a String? My parsing code is below.

    EDIT Altered the code as shown below, now it reads one of my document files, the problem is, there are tons of DOCs within 1 file and when I print (with the last command) it only gives me the first DOC returned.

    Java Code:
    import java.io.File;
    import java.io.FileReader;
    import java.io.IOException;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    
    public class TestClass2
    {
        /** Tags of the per-article fields, in the order they are printed. */
        private static final String[] FIELD_TAGS =
                {"DOCNO", "DOCID", "HL", "DATE", "SO", "CO", "IN", "LP", "TEXT"};

        /**
         * Parses the file WSJ_0402 with jsoup and prints the fields of every
         * DOC record it contains, one record after the other.
         *
         * @param args unused
         * @throws IOException when the file cannot be read
         */
        public static void main(String args[]) throws IOException
        {
            File input = new File("WSJ_0402");
            Document doc = Jsoup.parse(input, "UTF-8", "http://example.com/");

            // Walk every DOC record; taking only .first() printed just one.
            for (Element record : doc.select("DOC"))
            {
                for (String tag : FIELD_TAGS)
                {
                    System.out.println(tag + ": " + textOf(record, tag));
                }
                System.out.println();
            }
        }

        /**
         * Returns the text of the first {@code tag} element inside {@code record},
         * or an empty string when the record has no such element.
         */
        private static String textOf(Element record, String tag)
        {
            // Searching inside the record keeps fields of different records apart.
            Element field = record.select(tag).first();
            return field == null ? "" : field.text();
        }
    }
    Last edited by Blacky777; 12-04-2012 at 07:49 PM.

  6. #6
    JosAH's Avatar
    JosAH is online now Moderator
    Join Date
    Sep 2008
    Location
    Voorschoten, the Netherlands
    Posts
    13,728
    Blog Entries
    7
    Rep Power
    21

    Default Re: Lucene indexing problem

    Quote Originally Posted by Blacky777 View Post
    Okay I've used jsoup to extract the information from the angled brackets however I have another question. How do I get it to parse type fileReader instead of type string. My code for parsing is as below

    <snip>
    I don't know jsoup; aamof, I don't like any soup. Why not use an XMLReader? (it's in the Java core set of classes).

    kind regards,

    Jos
    cenosillicaphobia: the fear for an empty beer glass

  7. #7
    Blacky777 is offline Member
    Join Date
    Feb 2010
    Posts
    11
    Rep Power
    0

    Default Re: Lucene indexing problem

    haha not the biggest fan of soup either

    I can finally read all instances of each <> tag.
    Now I just need to get the two classes to communicate. WSJ_0402 is the name of one of the HTML files.

    Java Code:
    import java.io.File;
    import java.io.FileReader;
    import java.io.IOException;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    public class TestClass2
    {
        /**
         * Parses the file WSJ_0402 with jsoup and, for each tag of interest,
         * collects the text of all elements carrying that tag.
         *
         * @param args unused
         * @throws IOException when the file cannot be read
         */
        public static void main(String[] args) throws IOException
        {
            Document parsed = Jsoup.parse(new File("WSJ_0402"), "UTF-8", "http://example.com/");

            // Each selection matches every occurrence of the tag in the file, and
            // text() on the selection yields their text combined into one string.
            String linkText = parsed.select("DOC").text();
            String linkText2 = parsed.select("DOCNO").text();
            String linkText3 = parsed.select("DOCID").text();
            String linkText4 = parsed.select("HL").text();
            String linkText5 = parsed.select("DATE").text();
            String linkText6 = parsed.select("SO").text();
            String linkText7 = parsed.select("CO").text();
            String linkText8 = parsed.select("IN").text();
            String linkText9 = parsed.select("LP").text();
            String linkText10 = parsed.select("TEXT").text();
        }
    }
    EDIT A problem I face is that when I try to add a string to a field, it adds all instances as 1 entry for example i try to add dates as
    Java Code:
    ind.add(new StringField("DATE", linkText5, null));
    Luke (a .jar that displays my index) shows that DATE has one entry: 04/02/90 04/02/90 05/02/90... Is there a way to separate the terms? I know .first() gets the first term, but is there a way to 'pop' this term instead of 'peeking'?
    Last edited by Blacky777; 12-04-2012 at 08:48 PM.

  8. #8
    JosAH's Avatar
    JosAH is online now Moderator
    Join Date
    Sep 2008
    Location
    Voorschoten, the Netherlands
    Posts
    13,728
    Blog Entries
    7
    Rep Power
    21

    Default Re: Lucene indexing problem

    Sorry, I'm bailing out here: I don't know jsoup and I don't know Luke (is that also some kind of soup?)

    kind regards,

    Jos
    cenosillicaphobia: the fear for an empty beer glass

  9. #9
    Blacky777 is offline Member
    Join Date
    Feb 2010
    Posts
    11
    Rep Power
    0

    Default Re: Lucene indexing problem

    Thanks for your assistance so far JosAH

    I've managed to index the terms individually, the code is as shown below

    Java Code:
    package org.apache.lucene;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TopScoreDocCollector;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    import java.io.*;
    import java.nio.CharBuffer;
    import java.util.ArrayList;
    
    /**
     * This terminal application creates an Apache Lucene index in a folder and adds files into this index
     * based on the input of the user.
     */
    public class TextFileIndexer {
    	private static StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
    
    	private IndexWriter writer;
    	private ArrayList<File> queue = new ArrayList<File>();
    
    
    	public static void main(String[] args) throws IOException {
    		System.out.println("Enter the path where the index will be created");
    
    		String indexLocation = null;
    		BufferedReader br = new BufferedReader(
    				new InputStreamReader(System.in));
    		String s = br.readLine();
    
    		TextFileIndexer indexer = null;
    		try {
    			indexLocation = s;
    			indexer = new TextFileIndexer(s);
    		} catch (Exception ex) {
    			System.out.println("Cannot create index..." + ex.getMessage());
    			System.exit(-1);
    		}
    
    		//===================================================
    		//read input from user until he enters q for quit
    		//===================================================
    		while (!s.equalsIgnoreCase("q")) {
    			try {
    				System.out.println("Enter the full path to add into the index");
    				System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
    				s = br.readLine();
    				if (s.equalsIgnoreCase("q")) {
    					break;
    				}
    
    				//try to add file into the index
    				indexer.indexFileOrDirectory(s);
    			} catch (Exception e) {
    				System.out.println("Error indexing " + s + " : " + e.getMessage());
    			}
    		}
    
    		//===================================================
    		//call the closeIndex, otherwise the index is not created    
    		//===================================================
    		indexer.closeIndex();
    
    		//=========================================================
    		// Now search
    		//=========================================================
    		IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
    		IndexSearcher searcher = new IndexSearcher(reader);
    		TopScoreDocCollector collector = TopScoreDocCollector.create(5, true);
    
    		s = "";
    		while (!s.equalsIgnoreCase("q")) {
    			try {
    				System.out.println("Enter the search query (q=quit):");
    				s = br.readLine();
    				if (s.equalsIgnoreCase("q")) {
    					break;
    				}
    				Query q = new QueryParser(Version.LUCENE_40, "contents", analyzer).parse(s);
    				searcher.search(q, collector);
    				ScoreDoc[] hits = collector.topDocs().scoreDocs;
    
    				// 4. display results
    				System.out.println("Found " + hits.length + " hits.");
    				for(int i=0;i<hits.length;++i) {
    					int docId = hits[i].doc;
    					Document d = searcher.doc(docId);
    					System.out.println((i + 1) + ". " + d.get("path") + " score=" + hits[i].score);
    				}
    
    			} catch (Exception e) {
    				System.out.println("Error searching " + s + " : " + e.getMessage());
    			}
    		}
    
    	}
    
    	/**
    	 * Constructor
    	 * @param indexDir the name of the folder in which the index should be created
    	 * @throws java.io.IOException when exception creating index.
    	 */
    	TextFileIndexer(String indexDir) throws IOException {
    		// the boolean true parameter means to create a new index everytime, 
    		// potentially overwriting any existing files there.
    		FSDirectory dir = FSDirectory.open(new File(indexDir));
    
    
    		IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);
    
    		writer = new IndexWriter(dir, config);
    	}
    
    	/**
    	 * Indexes a file or directory
    	 * @param fileName the name of a text file or a folder we wish to add to the index
    	 * @throws java.io.IOException when exception
    	 */
    	public void indexFileOrDirectory(String fileName) throws IOException {
    		//===================================================
    		//gets the list of files in a folder (if user has submitted
    		//the name of a folder) or gets a single file name (is user
    		//has submitted only the file name) 
    		//===================================================
    		addFiles(new File(fileName));
    
    		int originalNumDocs = writer.numDocs();
    		for (File f : queue) {
    			FileReader fr = null;
    			FileReader frDate = null;
    			try {
    				Document ind = new Document();
    
    				//===================================================
    				// add contents of file
    				//===================================================
    				fr = new FileReader(f);
    
    				File input = new File("WSJ_0402");   //only 1 file right now
    				org.jsoup.nodes.Document doc = Jsoup.parse(input, "UTF-8", "http://example.com/");
    
    				//extracts the data from the <>'s
    				Elements DOC = doc.select("DOC");
    				Elements DOCNO = doc.select("DOCNO");
    				Elements DOCID = doc.select("DOCID");
    				Elements HEADLINE = doc.select("HL");
    				Elements DATE = doc.select("DATE");
    				Elements SOURCE = doc.select("SO");
    				Elements COMPANY = doc.select("CO");
    				Elements INDUSTRY = doc.select("IN");
    				Elements INTRODUCTION = doc.select("LP");
    				Elements ARTICLE = doc.select("TEXT");
    
    				//changes the data inside the <>'s to a string and adds the data to
    				//relevant field for each instance of DOC
    
    				for (int i=0; i<DOC.size();i++){
    
    					String linkText2 = DOCNO.get(i).toString();
    					ind.add(new StringField("DOCNO", linkText2, null));
    
    					String linkText3 = DOCID.get(i).toString();
    					ind.add(new StringField("DOCID", linkText3, null));
    
    					String linkText4 = HEADLINE.get(i).toString();
    					ind.add(new StringField("HEADLINE", linkText4, null));
    
    					String linkText5 = DATE.get(i).toString();
    					ind.add(new StringField("DATE", linkText5, null));
    
    					String linkText6 = SOURCE.get(i).toString();
    					ind.add(new StringField("SOURCE", linkText6, null));
    
    					String linkText7 = COMPANY.get(i).toString();
    					ind.add(new StringField("COMPANY", linkText7, null));
    
    					String linkText8 = INDUSTRY.get(i).toString();
    					ind.add(new StringField("INDUSTRY", linkText8, null));
    
    					String linkText9 = INTRODUCTION.get(i).toString();
    					ind.add(new StringField("INTRODUCTION", linkText9, null));
    
    					String linkText10 = ARTICLE.get(i).toString();
    					ind.add(new StringField("ARTICLE", linkText10, null));
    
    					writer.addDocument(ind);
    				}
    
    				System.out.println("Added: " + f);
    			} catch (Exception e) {
    				System.out.println("Could not add: " + f);
    			} finally {
    				fr.close();
    			}
    		}
    
    		int newNumDocs = writer.numDocs();
    		System.out.println("");
    		System.out.println("************************");
    		System.out.println(writer.numDocs() + " terms added.");
    		System.out.println("************************");
    
    		queue.clear();
    	}
    
    	private void addFiles(File file) {
    
    		if (!file.exists()) {
    			System.out.println(file + " does not exist.");
    		}
    		if (file.isDirectory()) {
    			for (File f : file.listFiles()) {
    				addFiles(f);
    			}
    		} else {
    			String filename = file.getName().toLowerCase();
    			//===================================================
    			// Only index text files
    			//===================================================
    			if (filename.endsWith("") || filename.endsWith(".html") || 
    					filename.endsWith(".xml") || filename.endsWith(".txt")) {
    				queue.add(file);
    			} else {
    				System.out.println("Skipped " + filename);
    			}
    		}
    	}
    
    	/**
    	 * Close the index.
    	 * @throws java.io.IOException when exception closing
    	 */
    	public void closeIndex() throws IOException {
    		writer.close();
    	}
    }
    The problem, though, is that this adds nearly 20,000 documents (the same as the total number of field entries) when there should be 194. Some records also have missing fields, which compounds the problem.

  10. #10
    Blacky777 is offline Member
    Join Date
    Feb 2010
    Posts
    11
    Rep Power
    0

    Default Re: Lucene indexing problem

    Can somebody tell me what the null value refers to in this command?

    Java Code:
    ind.add(new StringField("DOCID", linkText3, null))
    It says Store stored but I'm not sure what this means? Is this maybe why all my stored fields are not connected to one another as an individual document?

Similar Threads

  1. Lucene indexing help
    By JP10 in forum Lucene
    Replies: 1
    Last Post: 07-25-2011, 06:34 PM
  2. Indexing XML using lucene
    By peliukasss in forum Lucene
    Replies: 0
    Last Post: 03-28-2010, 11:20 PM
  3. Replies: 2
    Last Post: 10-06-2008, 09:20 PM
  4. Lucene indexing ans searching code needed
    By vgarg80 in forum JavaServer Pages (JSP) and JSTL
    Replies: 0
    Last Post: 06-07-2008, 12:31 PM
  5. Lucene Re-Indexing
    By connect2srinath in forum Lucene
    Replies: 1
    Last Post: 05-11-2008, 06:35 PM

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •