1
2
3
4
5
6
7 package net.sf.gumshoe.indexer;
8
9 import java.io.File;
10 import java.io.FileNotFoundException;
11 import java.io.IOException;
12 import java.io.Reader;
13 import java.util.List;
14
15 import org.apache.lucene.document.DateField;
16 import org.apache.lucene.document.Document;
17 import org.apache.lucene.document.Field;
18 import org.ccil.cowan.tagsoup.Parser;
19 import org.xml.sax.InputSource;
20 import org.xml.sax.SAXException;
21 import org.xml.sax.XMLReader;
22
23 /***
24 * @author Gabor
25 *
26 * TODO To change the template for this generated type comment go to
27 * Window - Preferences - Java - Code Style - Code Templates
28 */
29 public abstract class ContentReader {
30 public static final String MODIFIED = "modified";
31 public static final String FILENAME = "filename";
32 public static final String FILECATEGORY = "filecategory";
33 public static final String CONTENTS = "contents";
34
35 /*** Generate index entry for file
36 * @param f file to be indexed
37 * @return index entry
38 * @throws Exception
39 */
40 public abstract Document getDocument(File f) throws Exception;
41
42 /*** List extensions supported by this reader
43 * @return list of extensions
44 */
45 public abstract List getSupportedExtensions();
46
47 /*** Category for this reader. Currently all readers return empty string.
48 * Later this will allow developing a "type" based interface with
49 * special search fields for various categories, e.g. an email would
50 * have a from, to, cc, subject, etc. fields.
51 *
52 * @return category (like email, document, etc.)
53 */
54 public abstract String getCategory();
55
56 /*** Convenience method, adds default fields to index entry
57 * @param f file to be indexed
58 * @param doc index entry
59 * @throws IOException
60 */
61 protected void addDefaultFields(File f, Document doc, String category) throws IOException {
62 doc.add(Field.Keyword(FILENAME, f.getCanonicalPath()));
63 doc.add(Field.Keyword(FILECATEGORY, category));
64 doc.add(Field.Keyword(MODIFIED, DateField.timeToString(f.lastModified())));
65 }
66 /*** Convenience method to process XML content
67 * @param input an XML reader
68 * @return a Reader containing all content for this XML input
69 * @throws IOException
70 * @throws SAXException
71 * @throws FileNotFoundException
72 */
73 protected Reader getContentFromXML(Reader input) throws IOException, SAXException, FileNotFoundException {
74 XMLReader r=new Parser();
75 XMLSAXHandler ch=new XMLSAXHandler();
76 r.setContentHandler(ch);
77 r.parse(new InputSource(input));
78 return ch.getContent();
79 }
80 }