Source Code Cross Referenced for HarmoniseIndexer.java in » Content-Management-System » harmonise » org » openharmonise » rm » search » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Content Management System » harmonise » org.openharmonise.rm.search
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        /*
002:         * The contents of this file are subject to the 
003:         * Mozilla Public License Version 1.1 (the "License"); 
004:         * you may not use this file except in compliance with the License. 
005:         * You may obtain a copy of the License at http://www.mozilla.org/MPL/
006:         *
007:         * Software distributed under the License is distributed on an "AS IS"
008:         * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. 
009:         * See the License for the specific language governing rights and 
010:         * limitations under the License.
011:         *
012:         * The Initial Developer of the Original Code is Simulacra Media Ltd.
013:         * Portions created by Simulacra Media Ltd are Copyright (C) Simulacra Media Ltd, 2004.
014:         *
015:         * All Rights Reserved.
016:         *
017:         * Contributor(s):
018:         */
019:
020:        package org.openharmonise.rm.search;
021:
022:        import java.io.*;
023:        import java.util.*;
024:        import java.util.logging.*;
025:
026:        import javax.xml.transform.*;
027:        import javax.xml.transform.dom.*;
028:        import javax.xml.transform.stream.*;
029:
030:        import org.apache.lucene.analysis.*;
031:        import org.apache.lucene.analysis.standard.*;
032:        import org.apache.lucene.document.*;
033:        import org.apache.lucene.document.Document;
034:        import org.apache.lucene.index.*;
035:        import org.apache.lucene.queryParser.*;
036:        import org.apache.lucene.search.*;
037:        import org.apache.lucene.store.*;
038:        import org.openharmonise.commons.xml.*;
039:        import org.openharmonise.rm.*;
040:        import org.openharmonise.rm.config.*;
041:        import org.openharmonise.rm.resources.*;
042:        import org.openharmonise.rm.resources.content.*;
043:        import org.pdfbox.pdfparser.*;
044:        import org.pdfbox.pdmodel.*;
045:        import org.pdfbox.util.*;
046:        import EDU.oswego.cs.dl.util.concurrent.*;
047:
048:        /**
049:         * Interface to the Lucene text searching and indexing API for use with objects within Harmonise.
050:         * 
051:         * @author Michael Bell
052:         * @author jejking
053:         * @version $Revision: 1.4 $
054:         *
055:         */
056:        public class HarmoniseIndexer {
057:
058:            private static HarmoniseIndexer m_instance = null;
059:            private static String m_indexHome = "";
060:            private static Templates m_striptags_xsl = null;
061:            private static ArrayList keywordFieldList = new ArrayList();
062:
063:            private static final String FIELD_UNIQUEID = "uniqueid";
064:            private static final String FIELD_ID = "id";
065:            private static final String FIELD_NAME = "name";
066:            private static final String FIELD_DISPLAY_NAME = "display_name";
067:            private static final String FIELD_SUMMARY = "summary";
068:            private static final String FIELD_GROUP = "group";
069:            private static final String FIELD_CONTENTS = "contents";
070:            private static final String FIELD_CLASS = "class";
071:            private static final String INDEX_LOC_PROP = "INDEX_LOCATION";
072:            public static final String TAG_INDEXER = "indexer";
073:            public static final String TAG_INDEXABLE = "indexable";
074:            public static final String TAG_TEMPLATE = "template";
075:            public static final String TAG_COMPARISON = "comparison";
076:            public static final String TAG_INDEX = "index";
077:            public static final String ATTRIB_CLASSNAME = "classname";
078:            private static final String PNAME_STRIPTAGS_XSL = "STRIPTAGS";
079:
080:            private Executor executor;
081:
082:            /**
083:             * Logger.
084:             */
085:            private static Logger m_logger = Logger
086:                    .getLogger(HarmoniseIndexer.class.getName());
087:
088:            static {
089:                //initialise array list of key word fields
090:                keywordFieldList.add(FIELD_UNIQUEID);
091:                keywordFieldList.add(FIELD_ID);
092:                keywordFieldList.add(FIELD_GROUP);
093:            }
094:
/**
 * Default constructor. Reads the index location from the
 * <code>INDEX_LOCATION</code> configuration property and creates the
 * executor used to run index updates.
 * 
 * The configured location is validated before it is published to the
 * shared index location, so a failed construction no longer overwrites
 * that location with <code>null</code> or an empty string.
 * 
 * @throws HarmoniseIndexerException if the index location is not
 *         configured or the indexer could not be set up
 */
private HarmoniseIndexer() throws HarmoniseIndexerException {
    try {
        String sIndexHome = ConfigSettings.getProperty(INDEX_LOC_PROP);
        if ((sIndexHome == null) || (sIndexHome.length() == 0)) {
            throw new HarmoniseIndexerException(
                    "Index location is not defined!!");
        }
        m_indexHome = sIndexHome;
        executor = new QueuedExecutor();
    } catch (HarmoniseIndexerException e) {
        // already the right type: log and rethrow without wrapping it in itself
        m_logger.log(Level.SEVERE,
                "Could not instantiate HarmoniseIndexer", e);
        throw e;
    } catch (Exception e) {
        m_logger.log(Level.SEVERE,
                "Could not instantiate HarmoniseIndexer", e);
        throw new HarmoniseIndexerException(e.getMessage(), e);
    }
}
114:
115:            /**
116:             * Returns singleton instance of <code>HarmoniseIndexer</code>.
117:             * 
118:             * @return instance of <code>HarmoniseIndexer</code>.
119:             * @throws HarmoniseIndexerException
120:             */
121:            public static HarmoniseIndexer getInstance()
122:                    throws HarmoniseIndexerException {
123:                if (m_instance == null) {
124:                    m_instance = new HarmoniseIndexer();
125:                }
126:                return m_instance;
127:            }
128:
129:            public static HarmoniseIndexer getIndexer(String indexHome)
130:                    throws HarmoniseIndexerException {
131:                if (m_instance == null) {
132:                    m_instance = new HarmoniseIndexer();
133:                }
134:                HarmoniseIndexer.m_indexHome = indexHome;
135:                return m_instance;
136:            }
137:
138:            /**
139:             * Returns <code>true</code> if the given object is indexed.
140:             * 
141:             * @param xobj
142:             * @return
143:             * @throws HarmoniseIndexerException
144:             */
145:            public static boolean isIndexed(AbstractObject xobj)
146:                    throws HarmoniseIndexerException {
147:                boolean bExists = false;
148:
149:                try {
150:                    Directory directory = FSDirectory.getDirectory(
151:                            HarmoniseIndexer.m_indexHome, false);
152:                    IndexReader reader = IndexReader.open(directory);
153:                    Term term = new Term(HarmoniseIndexer.FIELD_UNIQUEID, xobj
154:                            .getClass().getName()
155:                            + String.valueOf(xobj.getId()));
156:
157:                    if (reader.docFreq(term) > 0) {
158:                        bExists = true;
159:                    }
160:                    reader.close();
161:                } catch (FileNotFoundException e) {
162:                    bExists = false;
163:                } catch (Exception e) {
164:                    m_logger.log(Level.WARNING, e.getLocalizedMessage(), e);
165:                    throw new HarmoniseIndexerException(e.getMessage(), e);
166:                }
167:
168:                return bExists;
169:            }
170:
171:            /**
172:             * Indexes the given object.
173:             * 
174:             * @param pObj
175:             * @throws HarmoniseIndexerException
176:             */
177:            public void indexObject(AbstractObject pObj)
178:                    throws HarmoniseIndexerException {
179:                if (pObj == null || (pObj instanceof  AbstractObject) == false) {
180:                    throw new HarmoniseIndexerException(
181:                            "Object must be AbstractObject - "
182:                                    + pObj.getClass().getName());
183:                }
184:                IndexRunnable indexer = new IndexRunnable(pObj);
185:                try {
186:                    executor.execute(indexer); // hands off to single background thread
187:                } catch (InterruptedException e) {
188:                    throw new HarmoniseIndexerException(
189:                            "Problem running indexer asynchronously", e);
190:                }
191:            }
192:
193:            /**
194:             * Searches the index for objects of the type given by the <code>Class</code> and 
195:             * fulfilling the conditions given by the other arguments and returns a <code>List</code>
196:             * of object IDs.
197:             * 
198:             * @param xobj
199:             * @param groupIds
200:             * @param sName
201:             * @param sSummary
202:             * @param sContent
203:             * @return
204:             * @throws HarmoniseIndexerException
205:             */
206:            public List searchContents(Class xobjClass, Vector groupIds,
207:                    String sName, String sSummary, String sContent)
208:                    throws HarmoniseIndexerException {
209:                return searchContents(getQuery(xobjClass, groupIds, sName,
210:                        null, sSummary, sContent));
211:            }
212:
213:            /**
214:             * Returns the Lucene query string built from the conditions given for 'name', 'summary', etc.
215:             * 
216:             * @param xobjClass
217:             * @param groupIds
218:             * @param sName
219:             * @param sDisplayName
220:             * @param sSummary
221:             * @param sContent
222:             * 
223:             * @return
224:             */
225:            public String getQuery(Class xobjClass, Vector groupIds,
226:                    String sName, String sDisplayName, String sSummary,
227:                    String sContent) {
228:                StringBuffer sQuery = new StringBuffer();
229:
230:                sQuery.append(HarmoniseIndexer.FIELD_CLASS).append(":").append(
231:                        xobjClass.getName());
232:
233:                if ((groupIds != null) && (groupIds.size() > 0)) {
234:                    sQuery.append(" AND (");
235:
236:                    for (int i = 0; i < groupIds.size(); i++) {
237:                        if (i > 0) {
238:                            sQuery.append(" OR ");
239:                        }
240:
241:                        sQuery.append(HarmoniseIndexer.FIELD_GROUP).append(":")
242:                                .append(groupIds.elementAt(i));
243:                    }
244:
245:                    sQuery.append(") ");
246:                }
247:
248:                sQuery.append(" AND (");
249:
250:                boolean bOR = false;
251:
252:                // Process Name, if it has been submitted
253:                if ((sName != null) && (sName.length() > 0)) {
254:                    buildFieldQueryString(sQuery, FIELD_NAME, sName);
255:                    bOR = true;
256:                }
257:
258:                if (sDisplayName != null) {
259:                    if (bOR) {
260:                        sQuery.append(" OR ");
261:                    }
262:                    buildFieldQueryString(sQuery, FIELD_DISPLAY_NAME,
263:                            sDisplayName);
264:
265:                    bOR = true;
266:                }
267:
268:                // Process Summary, it it has been submitted
269:                if ((sSummary != null) && (sSummary.length() > 0)) {
270:                    if (bOR) {
271:                        sQuery.append(" OR ");
272:                    }
273:
274:                    buildFieldQueryString(sQuery, FIELD_SUMMARY, sSummary);
275:                    bOR = true;
276:                }
277:
278:                // Process Content, if it is been submitted
279:                if ((sContent != null) && (sContent.length() > 0)) {
280:                    if (bOR) {
281:                        sQuery.append(" OR ");
282:                    }
283:
284:                    buildFieldQueryString(sQuery, FIELD_CONTENTS, sContent);
285:                }
286:
287:                sQuery.append(")");
288:
289:                return sQuery.toString();
290:            }
291:
292:            /**
293:             * Runs the given query against the index and returns a <code>List</code> of object
294:             * IDs.
295:             * 
296:             * @param queryString
297:             * @return List of hits
298:             * @throws HarmoniseIndexerException
299:             */
300:            public List searchContents(String queryString)
301:                    throws HarmoniseIndexerException {
302:                Vector vec = new Vector();
303:
304:                if (m_logger.getLevel() == Level.FINE) {
305:                    m_logger.log(Level.FINE, "Lucene query - " + queryString);
306:                }
307:
308:                try {
309:                    Searcher searcher = new IndexSearcher(m_indexHome);
310:                    StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
311:
312:                    //need a PerFieldAnalyzerWrapper so that our PorterStem 
313:                    //analyzer isn't applied to the keywords we've set
314:                    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(
315:                            new HarmoniseAnalyzer());
316:
317:                    Iterator iter = keywordFieldList.iterator();
318:
319:                    while (iter.hasNext()) {
320:                        String field = (String) iter.next();
321:                        analyzer.addAnalyzer(field, standardAnalyzer);
322:                    }
323:
324:                    Query query = QueryParser.parse(queryString,
325:                            FIELD_CONTENTS, analyzer);
326:
327:                    Hits hits = searcher.search(query);
328:                    if (m_logger.getLevel() == Level.FINE) {
329:                        m_logger.log(Level.FINE, "Lucene query found "
330:                                + hits.length() + " hits in the index");
331:                    }
332:
333:                    for (int i = 0; i < hits.length(); i++) {
334:                        vec.addElement(hits.doc(i).get(FIELD_ID));
335:                    }
336:
337:                    searcher.close();
338:                } catch (Exception e) {
339:                    m_logger.log(Level.WARNING, e.getLocalizedMessage(), e);
340:                }
341:                // if we encounter any errors, rather than propagating the exception up
342:                // we'll just return the vector, but it'll be empty
343:                return vec;
344:            }
345:
346:            /**
347:             * Utility method to query the Lucene index independently of the Harmonise API.
348:             * 
349:             * @param queryString a correctly formatted Lucene query to be parsed.
350:             * @return List of Strings containing some summary info about the Hits returned
351:             * @throws HarmoniseIndexerException
352:             */
353:            public List search(String queryString)
354:                    throws HarmoniseIndexerException {
355:                List hitsList = new ArrayList();
356:                Hits hits = null;
357:                try {
358:                    Searcher searcher = new IndexSearcher(m_indexHome);
359:                    Query query = QueryParser.parse(queryString,
360:                            FIELD_CONTENTS, new HarmoniseAnalyzer());
361:                    hits = searcher.search(query);
362:                    //	iterate through hits and build a list to return
363:
364:                    for (int i = 0; i < hits.length(); i++) {
365:                        StringBuffer sb = new StringBuffer();
366:                        org.apache.lucene.document.Document doc = hits.doc(i);
367:                        sb.append("Unique id: " + doc.get(FIELD_UNIQUEID));
368:                        sb.append("\n");
369:                        sb.append("Summary: " + doc.get(FIELD_SUMMARY));
370:                        sb.append("\n");
371:                        sb.append("Name: " + doc.get(FIELD_NAME));
372:                        sb.append("\n");
373:                        sb.append("Class: " + doc.get(FIELD_CLASS));
374:                        hitsList.add(sb.toString());
375:                    }
376:                    searcher.close();
377:                } catch (Exception e) {
378:                    throw new HarmoniseIndexerException(e.getMessage(), e);
379:                }
380:                return hitsList;
381:            }
382:
383:            /**
384:             * Deletes the given object from the index.
385:             * 
386:             * @param xobj
387:             * @throws HarmoniseIndexerException
388:             */
389:            public void deleteFromIndex(AbstractObject xobj)
390:                    throws HarmoniseIndexerException {
391:                DeleterRunnable deleter = new DeleterRunnable(xobj);
392:                try {
393:                    executor.execute(deleter);
394:                } catch (InterruptedException e) {
395:                    throw new HarmoniseIndexerException(
396:                            "Problem running delete asynchronously", e);
397:                }
398:            }
399:
400:            /**
401:             * Utility method to process boolean operators in the raw query string correctly into the
402:             * so that the terms are associated with the correct Lucene field name.
403:             * 
404:             * @param queryBuf StringBuffer being used to assemble the final Lucene query
405:             * @param fieldName name of the field, so we can prepend it so that Lucene field is searched
406:             * @param inputString the raw input, to be processed to make it field specific
407:             */
408:            private void buildFieldQueryString(StringBuffer queryBuf,
409:                    String fieldName, String inputString) {
410:                //just tokenise on white space as per default
411:                StringTokenizer tokeniser = new StringTokenizer(inputString);
412:
413:                // ** Use the following code to build simple phrases
414:                // ** when this is used, make sure that two successive quotes "" are replaced with 
415:                // ** one single quote or the parser falls over		
416:                boolean buildingPhrase = false;
417:                while (tokeniser.hasMoreTokens()) {
418:                    String token = tokeniser.nextToken();
419:                    token = token.replaceAll("\"\"", "\"");
420:                    if (token.equals("AND") || token.equals("OR")
421:                            || token.equals("NOT")) {
422:                        queryBuf.append(token + " "); // it's an operator, just append it raw
423:                    } else {
424:                        // are we building a phrase?
425:                        if (buildingPhrase == true) {
426:                            queryBuf.append(token + " "); // no need to prepend field:
427:                            // do we need to stop building the phrase/
428:                            if (token.endsWith("\"")) {
429:                                buildingPhrase = false;
430:                            }
431:                            continue;
432:                        }
433:                        queryBuf.append(fieldName + ":" + token + " ");
434:                        // are we going to start building a phrase ?
435:                        if (token.startsWith("\"")) {
436:                            buildingPhrase = true;
437:                        }
438:                    }
439:                }
440:            }
441:
442:            /**
443:             * Utility to add objects to the Lucene index. Extracts the indexable fields, including contents for
444:             * PDF and XML docments and writes them to the index.
445:             *  
446:             * @author John King
447:             */
448:            private class IndexRunnable implements  Runnable {
449:
450:                private AbstractObject obj;
451:                private String contents;
452:
453:                public IndexRunnable(AbstractObject obj) {
454:                    this .obj = obj;
455:                }
456:
457:                /* (non-Javadoc)
458:                 * @see java.lang.Runnable#run()
459:                 */
460:                public void run() {
461:                    contents = getContents();
462:                    Document doc = new Document(); // new Lucene document to hold details we're indexing
463:
464:                    String classname = obj.getClass().getName();
465:
466:                    doc.add(Field.Keyword(FIELD_UNIQUEID, classname
467:                            + String.valueOf(obj.getId())));
468:                    doc.add(Field.UnIndexed(FIELD_ID, String.valueOf(obj
469:                            .getId())));
470:
471:                    try {
472:                        AbstractParentObject grp = ((AbstractChildObject) obj)
473:                                .getRealParent();
474:                        if (grp != null) {
475:                            doc.add(Field.Keyword(FIELD_GROUP, String
476:                                    .valueOf(grp.getId())));
477:                        }
478:
479:                        doc.add(Field.Text(FIELD_CLASS, classname));
480:                        doc.add(Field.Text(FIELD_NAME, obj.getName()));
481:                        if (obj.getSummary() != null) {
482:                            doc
483:                                    .add(Field.Text(FIELD_SUMMARY, obj
484:                                            .getSummary()));
485:                        }
486:
487:                        if (obj instanceof  AbstractEditableObject) {
488:                            AbstractEditableObject edObj = (AbstractEditableObject) obj;
489:                            String sDispName = edObj.getDisplayName();
490:
491:                            if (sDispName != null) {
492:                                doc.add(Field.Text(FIELD_DISPLAY_NAME,
493:                                        sDispName));
494:                            }
495:                        }
496:
497:                        if (contents != null) {
498:                            doc.add(Field.Text(FIELD_CONTENTS,
499:                                    new StringReader(contents)));
500:                        }
501:
502:                        if (HarmoniseIndexer.isIndexed(obj) == true) {
503:                            Directory directory = FSDirectory.getDirectory(
504:                                    HarmoniseIndexer.m_indexHome, false);
505:
506:                            if (IndexReader.indexExists(directory)) {
507:                                IndexReader reader = IndexReader
508:                                        .open(directory);
509:                                Term term = new Term(
510:                                        HarmoniseIndexer.FIELD_UNIQUEID, obj
511:                                                .getClass().getName()
512:                                                + String.valueOf(obj.getId()));
513:                                reader.delete(term);
514:                                reader.close();
515:                            }
516:                        }
517:                        IndexWriter writer = null;
518:                        try {
519:                            writer = new IndexWriter(
520:                                    HarmoniseIndexer.m_indexHome,
521:                                    new HarmoniseAnalyzer(), false);
522:                        } catch (FileNotFoundException e) {
523:                            writer = new IndexWriter(
524:                                    HarmoniseIndexer.m_indexHome,
525:                                    new HarmoniseAnalyzer(), true);
526:                        }
527:
528:                        writer.addDocument(doc);
529:                        writer.optimize();
530:                        writer.close();
531:                        HarmoniseIndexer.m_logger.log(Level.INFO, "indexed "
532:                                + obj.getType() + ", ID: " + obj.getId());
533:                    } catch (DataAccessException e) {
534:                        HarmoniseIndexer.m_logger.log(Level.WARNING,
535:                                "Data Access Exception", e);
536:                    } catch (IOException e) {
537:                        HarmoniseIndexer.m_logger.log(Level.WARNING,
538:                                "IOException", e);
539:                    } catch (HarmoniseIndexerException e) {
540:                        HarmoniseIndexer.m_logger.log(Level.WARNING,
541:                                "Harmonise Indexer Exception", e);
542:                    }
543:                }
544:
545:                private String getContents() {
546:
547:                    String objContents = null;
548:
549:                    try {
550:                        if (obj instanceof  org.openharmonise.rm.resources.content.Document) {
551:                            org.openharmonise.rm.resources.content.Document doc = (org.openharmonise.rm.resources.content.Document) obj;
552:                            org.w3c.dom.Document xmlcontent = XMLDocument
553:                                    .getXMLDocumentFromString(doc.getContent());
554:                            objContents = getStringFromXML(xmlcontent);
555:                        } else if (obj instanceof  Asset) {
556:                            Asset asset = (Asset) obj;
557:                            if (asset.getContentType().equalsIgnoreCase(
558:                                    "application/pdf")) {
559:                                objContents = getStringFromPDF(asset
560:                                        .getContentFile());
561:                            }
562:                        }
563:                    } catch (Exception e) {
564:                        HarmoniseIndexer.m_logger.log(Level.WARNING,
565:                                "Exception", e);
566:                    }
567:                    return objContents;
568:                }
569:
570:                /**
571:                 * Returns the text content of an XML document.
572:                 * 
573:                 * @param xml
574:                 * @return String representing content of document once tags have been stripped off.
575:                 * @throws HarmoniseIndexerException
576:                 */
577:                private String getStringFromXML(org.w3c.dom.Document xml)
578:                        throws HarmoniseIndexerException {
579:
580:                    String sResult = "";
581:                    try {
582:                        if (HarmoniseIndexer.m_striptags_xsl == null) {
583:                            //get strip tags xsl if not already created
584:                            String stripFileName = ConfigSettings
585:                                    .getProperty(PNAME_STRIPTAGS_XSL);
586:
587:                            if (stripFileName != null
588:                                    && stripFileName.length() > 0) {
589:                                StreamSource ssource = new StreamSource(
590:                                        new File(stripFileName));
591:                                HarmoniseIndexer.m_striptags_xsl = (Templates) org.apache.xalan.xsltc.trax.TransformerFactoryImpl
592:                                        .newInstance().newTemplates(ssource);
593:                            }
594:                        }
595:                        //if m_striptags_xsl is null here don't do anything
596:                        if (m_striptags_xsl != null) {
597:                            Transformer trans = HarmoniseIndexer.m_striptags_xsl
598:                                    .newTransformer();
599:                            DOMSource ds = new DOMSource(xml
600:                                    .getDocumentElement());
601:                            StringWriter sw = new StringWriter();
602:                            StreamResult res = new StreamResult(sw);
603:                            trans.transform(ds, res);
604:                            sResult = sw.toString();
605:                            sw.close();
606:                        }
607:                    } catch (ConfigException e) {
608:                        throw new HarmoniseIndexerException("Config error", e);
609:                    } catch (TransformerConfigurationException e) {
610:                        throw new HarmoniseIndexerException(
611:                                "Transformer Configuration Exception", e);
612:                    } catch (TransformerFactoryConfigurationError e) {
613:                        throw new HarmoniseIndexerException(
614:                                "Transformer Factory Configuration error", e);
615:                    } catch (TransformerException e) {
616:                        throw new HarmoniseIndexerException(
617:                                "Transformer error", e);
618:                    } catch (IOException e) {
619:                        throw new HarmoniseIndexerException("IO error", e);
620:                    }
621:
622:                    return sResult;
623:                }
624:
625:                /**
626:                 * Returns the text content of a PDF file as a String.
627:                 * 
628:                 * @param pdfFile
629:                 * @return
630:                 * @throws HarmoniseIndexerException
631:                 */
632:                private String getStringFromPDF(File pdfFile)
633:                        throws HarmoniseIndexerException {
634:                    String sText = "";
635:
636:                    try {
637:                        FileInputStream pdfStream = new FileInputStream(pdfFile);
638:                        PDFParser pdfParser = new PDFParser(pdfStream);
639:                        pdfParser.parse();
640:                        PDDocument pdf = pdfParser.getPDDocument();
641:                        PDFTextStripper textstripper = new PDFTextStripper();
642:                        sText = textstripper.getText(pdf);
643:                        HarmoniseIndexer.m_logger.log(Level.FINEST, sText);
644:                        pdf.close();
645:                    } catch (FileNotFoundException e) {
646:                        throw new HarmoniseIndexerException("File not found", e);
647:                    } catch (IOException e) {
648:                        throw new HarmoniseIndexerException("IO exception", e);
649:                    }
650:
651:                    return sText;
652:                }
653:            }
654:
655:            /**
656:             * Deletion utility to remove objects from the Lucene index.
657:             * 
658:             * @author jejking
659:             */
660:            private class DeleterRunnable implements  Runnable {
661:
662:                private AbstractObject obj;
663:
664:                /**
665:                 * @param obj the Harmonise object to delete
666:                 */
667:                public DeleterRunnable(AbstractObject obj) {
668:                    if (obj == null) {
669:                        throw new NullPointerException("obj cannot be null");
670:                    }
671:                    this .obj = obj;
672:                }
673:
674:                /* (non-Javadoc)
675:                 * @see java.lang.Runnable#run()
676:                 */
677:                public void run() {
678:                    try {
679:                        Directory directory = FSDirectory.getDirectory(
680:                                HarmoniseIndexer.m_indexHome, false);
681:
682:                        if (IndexReader.indexExists(directory)) {
683:                            IndexReader reader = IndexReader.open(directory);
684:                            Term term = new Term(
685:                                    HarmoniseIndexer.FIELD_UNIQUEID, obj
686:                                            .getClass().getName()
687:                                            + String.valueOf(obj.getId()));
688:                            reader.delete(term);
689:                            reader.close();
690:                            HarmoniseIndexer.m_logger.log(Level.FINE,
691:                                    "deleted " + obj.getType() + ", ID: "
692:                                            + obj.getId() + " from index");
693:                        }
694:                    } catch (Exception e) {
695:                        HarmoniseIndexer.m_logger.log(Level.WARNING,
696:                                "problem deleting object", e);
697:                    }
698:                }
699:            }
700:
701:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.