001: /*
002:
003: * LIUS - Lucene Index Update and Search
004: * http://sourceforge.net/projects/lius/
005: *
006: * Copyright (c) 2005, Laval University Library. All rights reserved.
007: *
008: * This library is free software; you can redistribute it and/or
009: * modify it under the terms of the GNU Lesser General Public
010: * License as published by the Free Software Foundation; either
011: * version 2.1 of the License, or (at your option) any later version.
012: *
013: * This library is distributed in the hope that it will be useful,
014: * but WITHOUT ANY WARRANTY; without even the implied warranty of
015: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
016: * Lesser General Public License for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public
019: * License along with this library; if not, write to the Free Software
020: * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
021: */
022:
023: package ca.ulaval.bibl.lius.index.XML;
024:
025: import java.io.File;
026: import java.io.IOException;
027: import java.lang.reflect.InvocationTargetException;
028: import java.util.ArrayList;
029: import java.util.Collection;
030: import java.util.Iterator;
031: import java.util.List;
032: import java.util.Map;
033:
034: import org.apache.commons.beanutils.BeanUtils;
035: import org.apache.log4j.Logger;
036: import org.jaxen.JaxenException;
037: import org.jaxen.SimpleNamespaceContext;
038: import org.jaxen.jdom.JDOMXPath;
039: import org.jdom.Attribute;
040: import org.jdom.CDATA;
041: import org.jdom.Comment;
042: import org.jdom.Element;
043: import org.jdom.EntityRef;
044: import org.jdom.JDOMException;
045: import org.jdom.Namespace;
046: import org.jdom.ProcessingInstruction;
047: import org.jdom.Text;
048: import org.jdom.input.SAXBuilder;
049:
050: import ca.ulaval.bibl.lius.Lucene.LuceneActions;
051: import ca.ulaval.bibl.lius.config.LiusConfig;
052: import ca.ulaval.bibl.lius.config.LiusConfigBuilder;
053: import ca.ulaval.bibl.lius.config.LiusField;
054: import ca.ulaval.bibl.lius.index.Indexer;
055:
056: /**
057: *
058: * Classe se basant sur JDOM et XPATH pour indexer des fichiers XML.
059: *
060: * <br/><br/>
061: *
062: * Class based on JDOM and XPATH for indexing XML files.
063: *
064: * @author Rida Benjelloun (rida.benjelloun@bibl.ulaval.ca)
065: *
066: */
067:
068: public class XmlFileIndexer
069:
070: extends Indexer {
071:
072: private SimpleNamespaceContext nsc = new SimpleNamespaceContext();
073:
074: private String toIndex = null;
075:
076: static Logger logger = Logger.getRootLogger();
077:
078: /**
079: *
080: * Permet de parser un fichier XML et de retourner un objet de type JDOM
081: * Document.
082: *
083: * <br/><br/>
084: *
085: * Parse an XML file and returns a JDOM object.
086: *
087: */
088:
089: public Object parse(Object file) {
090:
091: org.jdom.Document xmlDoc = new org.jdom.Document();
092:
093: try {
094:
095: SAXBuilder builder = new SAXBuilder();
096:
097: builder.setValidation(false);
098:
099: xmlDoc = builder.build(new File((String) file));
100:
101: }
102:
103: catch (JDOMException e) {
104:
105: logger.error(e.getMessage());
106:
107: }
108:
109: catch (IOException e) {
110:
111: logger.error(e.getMessage());
112:
113: }
114:
115: return xmlDoc;
116:
117: }
118:
119: /**
120: *
121: * Méthode retournant un objet de type "Lucene document" à partir du fichier
122: *
123: * XML à indexer et du fichier de configuration de Lius exprimé sous forme
124: *
125: * d'objet de type "LiusConfig".
126: *
127: * <br/><br/>
128: *
129: * Method that returns Lucene Document object from an XML file to index and
130: *
131: * the Lius configuration file as a LiusConfig object.
132: *
133: *
134: *
135: */
136:
137: public org.apache.lucene.document.Document createLuceneDocument(
138: String
139:
140: xmlFile, LiusConfig lc) {
141:
142: Collection liusFields = lc.getXmlFileFields();
143:
144: org.apache.lucene.document.Document luceneDoc = createLuceneDocument(
145:
146: xmlFile,
147:
148: liusFields);
149:
150: return luceneDoc;
151:
152: }
153:
154: /**
155: *
156: * Permet de placer un noeud XML dans un document Lucene. Chaque élément du
157: *
158: * noeud est indexé en se basant sur une collection d'objets de type
159: * "LiusField"
160: *
161: * qui contient le nom du champs dans lucene, l'expression Xpath pour
162: * séléctionner
163: *
164: * le noeud et le séparateur d'occurences si on veut placer les différentes
165: *
166: * occurences d'un même élément dans le même champs; dans la cas contraire
167: * chaque
168: *
169: * élément sera placé dans son propre champs.
170: *
171: * <br/><br/>
172: *
173: * Place an XML node in a Lucene document. Each element of the node is
174: * indexed
175: *
176: * based on a collection of type LiusField containing the name of the field
177: * in
178: *
179: * Lucene, the XPath expression to select the node and the hits separator if
180: * we want
181: *
182: * differents hits of a same element in the same field; in the other case
183: * each
184: *
185: * element is placed in is own field.
186: *
187: */
188:
189: public org.apache.lucene.document.Document storeNodeInLuceneDocument(
190: Object
191:
192: xmlDoc, Collection liusFields) {
193:
194: Collection resColl = getPopulatedCollection(xmlDoc, liusFields);
195:
196: org.apache.lucene.document.Document luceneDoc = LuceneActions.
197:
198: getSingletonInstance().populateLuceneDoc(resColl);
199:
200: return luceneDoc;
201:
202: }
203:
204: /*
205: * private void populateField(LiusField newField, LiusField lf) {
206: *
207: * newField.setName(lf.getName());
208: *
209: * newField.setType(lf.getType());
210: *
211: * newField.setXpathSelect(lf.getXpathSelect());
212: *
213: * newField.setDateFormat(lf.getDateFormat());
214: * }
215: */
216:
217: /**
218: *
219: * Méthode permettant de concaténer les occurences multiples d'un élément
220: * qui
221: *
222: * vont être stockées dans le même document Lucene.
223: *
224: * <br/><br/>
225: *
226: * Method that concatenates multiple hist of an element which will be saved
227: * in
228: *
229: * the same Lucene document.
230: *
231: */
232:
233: public String concatOccurance(Object xmlDoc,
234:
235: String xpath,
236:
237: String concatSep) {
238:
239: StringBuffer chaineConcat = new StringBuffer();
240:
241: try {
242:
243: JDOMXPath xp = new JDOMXPath(xpath);
244:
245: xp.setNamespaceContext(nsc);
246:
247: List ls = xp.selectNodes(xmlDoc);
248:
249: Iterator i = ls.iterator();
250:
251: int j = 0;
252:
253: while (i.hasNext()) {
254:
255: j++;
256:
257: String text = "";
258:
259: Object obj = (Object) i.next();
260:
261: if (obj instanceof Element) {
262:
263: Element elem = (Element) obj;
264:
265: text = elem.getText().trim();
266:
267: }
268:
269: else if (obj instanceof Attribute) {
270:
271: Attribute att = (Attribute) obj;
272:
273: text = att.getValue().trim();
274:
275: }
276:
277: else if (obj instanceof Text) {
278:
279: Text txt = (Text) obj;
280:
281: text = txt.getText().trim();
282:
283: }
284:
285: else if (obj instanceof CDATA) {
286:
287: CDATA cdata = (CDATA) obj;
288:
289: text = cdata.getText().trim();
290:
291: }
292:
293: else if (obj instanceof Comment) {
294:
295: Comment com = (Comment) obj;
296:
297: text = com.getText().trim();
298:
299: }
300:
301: else if (obj instanceof ProcessingInstruction) {
302:
303: ProcessingInstruction pi = (ProcessingInstruction) obj;
304:
305: text = pi.getData().trim();
306:
307: }
308:
309: else if (obj instanceof EntityRef) {
310:
311: EntityRef er = (EntityRef) obj;
312:
313: text = er.toString().trim();
314:
315: }
316:
317: if (text != "") {
318:
319: if (ls.size() == 1) {
320:
321: chaineConcat.append(text);
322:
323: return chaineConcat.toString().trim();
324:
325: }
326:
327: else {
328:
329: if (ls.size() == j)
330:
331: chaineConcat.append(text);
332:
333: else
334:
335: chaineConcat.append(text + " " + concatSep
336: + " ");
337:
338: }
339:
340: }
341:
342: }
343:
344: }
345:
346: catch (JaxenException j) {
347:
348: logger.error(j.getMessage());
349:
350: }
351:
352: return chaineConcat.toString().trim();
353:
354: }
355:
356: public void setFileName(String toIndex) {
357:
358: this .toIndex = toIndex;
359:
360: }
361:
362: public String getFileName() {
363:
364: return toIndex;
365:
366: }
367:
368: /**
369: *
370: * Retourne une collection contenant les champs avec les valeurs à indexer
371: *
372: * comme par exemple: le texte integral, titre etc.
373: *
374: * <br/><br/>
375: *
376: * Returns a collection containing the fields with the values to index; like :
377: *
378: * full text, title, etc.
379: *
380: */
381:
382: public Collection getPopulatedCollection(Object xmlFile,
383: Collection liusField) {
384:
385: Object xmlDoc = null;
386:
387: List documentNs = null;
388:
389: Map hm = null;
390:
391: boolean nsTrouve = false;
392:
393: boolean isMap = false;
394:
395: Collection resColl = new ArrayList();
396:
397: if (xmlFile instanceof org.jdom.Document ||
398:
399: xmlFile instanceof org.jdom.Element)
400:
401: xmlDoc = xmlFile;
402:
403: else
404:
405: xmlDoc = parse(xmlFile);
406:
407: if (xmlDoc instanceof org.jdom.Document) {
408:
409: documentNs = getAllDocumentNs((org.jdom.Document) xmlDoc);
410:
411: }
412:
413: Iterator itColl = liusField.iterator();
414:
415: while (itColl.hasNext()) {
416:
417: Object colElem = itColl.next();
418:
419: if (colElem instanceof Map) {
420:
421: isMap = true;
422:
423: hm = (Map) colElem;
424:
425: for (int j = 0; j < documentNs.size(); j++) {
426:
427: Collection liusFields = (Collection) hm
428: .get(documentNs.get(j));
429:
430: if (liusFields != null) {
431:
432: nsTrouve = true;
433:
434: extractDataFromElements(xmlDoc, liusFields,
435: resColl);
436:
437: }
438:
439: }
440:
441: }
442:
443: if (nsTrouve == false && (colElem instanceof Map)) {
444:
445: extractDataFromElements(xmlDoc, (Collection) hm
446: .get("default"), resColl);
447:
448: }
449:
450: }
451:
452: if (isMap == false)
453:
454: extractDataFromElements(xmlDoc, liusField, resColl);
455:
456: return resColl;
457:
458: }
459:
460: public Collection getPopulatedCollection(Object file,
461: String liusConfig) {
462:
463: LiusConfig lc = LiusConfigBuilder.getSingletonInstance()
464: .getLiusConfig(
465:
466: liusConfig);
467:
468: return getPopulatedCollection(file, lc);
469:
470: }
471:
472: public Collection getPopulatedCollection(Object file, LiusConfig lc) {
473:
474: return getPopulatedCollection(file, lc.getXmlFileFields());
475:
476: }
477:
478: private void extractDataFromElements(Object xmlDoc,
479: Collection liusFields,
480:
481: Collection resColl) {
482:
483: Iterator it = liusFields.iterator();
484:
485: while (it.hasNext()) {
486:
487: Object field = it.next();
488:
489: if (field instanceof LiusField) {
490:
491: LiusField lf = (LiusField) field;
492:
493: if (lf.getOcurSep() != null) {
494:
495: String cont = concatOccurance(xmlDoc, lf
496: .getXpathSelect(),
497:
498: lf.getOcurSep());
499:
500: lf.setValue(cont);
501:
502: resColl.add(lf);
503:
504: }
505:
506: else {
507:
508: try {
509:
510: JDOMXPath xp = new JDOMXPath(lf
511: .getXpathSelect());
512:
513: xp.setNamespaceContext(nsc);
514:
515: List selectList = xp.selectNodes(xmlDoc);
516:
517: Iterator i = selectList.iterator();
518:
519: while (i.hasNext()) {
520:
521: LiusField lfoccur = new LiusField();
522:
523: BeanUtils.copyProperties(lfoccur, lf);
524:
525: Object selection = (Object) i.next();
526:
527: if (selection instanceof Element) {
528:
529: Element elem = (Element) selection;
530:
531: if (elem.getText().trim() != null &&
532:
533: elem.getText().trim() != "") {
534:
535: lfoccur.setValue(elem.getText());
536:
537: resColl.add(lfoccur);
538:
539: }
540:
541: }
542:
543: else if (selection instanceof Attribute) {
544:
545: Attribute att = (Attribute) selection;
546:
547: lf.setValue(att.getValue());
548:
549: resColl.add(lf);
550:
551: }
552:
553: else if (selection instanceof Text) {
554:
555: Text text = (Text) selection;
556:
557: lf.setValue(text.getText());
558:
559: resColl.add(lf);
560:
561: }
562:
563: else if (selection instanceof CDATA) {
564:
565: CDATA cdata = (CDATA) selection;
566:
567: lf.setValue(cdata.getText());
568:
569: resColl.add(lf);
570:
571: }
572:
573: else if (selection instanceof Comment) {
574:
575: Comment com = (Comment) selection;
576:
577: lf.setValue(com.getText());
578:
579: resColl.add(lf);
580:
581: }
582:
583: else if (selection instanceof ProcessingInstruction) {
584:
585: ProcessingInstruction pi = (ProcessingInstruction)
586:
587: selection;
588:
589: lf.setValue(pi.getData());
590:
591: resColl.add(lf);
592:
593: }
594:
595: else if (selection instanceof EntityRef) {
596:
597: EntityRef er = (EntityRef) selection;
598:
599: lf.setValue(er.toString());
600:
601: resColl.add(lf);
602:
603: }
604:
605: }
606:
607: }
608:
609: catch (JaxenException e) {
610:
611: logger.error(e.getMessage());
612:
613: }
614:
615: catch (InvocationTargetException ex) {
616:
617: logger.error(ex.getMessage());
618:
619: }
620:
621: catch (IllegalAccessException ex) {
622:
623: logger.error(ex.getMessage());
624:
625: }
626:
627: }
628:
629: }
630:
631: else {
632:
633: resColl.add(field);
634:
635: }
636:
637: }
638:
639: }
640:
641: public List getAllDocumentNs(org.jdom.Document doc) {
642:
643: List ls = new ArrayList();
644:
645: processChildren(doc.getRootElement(), ls);
646:
647: return ls;
648:
649: }
650:
651: private boolean exist(List nsLs, String nsUri) {
652:
653: if (nsLs.isEmpty())
654:
655: return false;
656:
657: for (int i = 0; i < nsLs.size(); i++) {
658:
659: if (((String) nsLs.get(i)).equals(nsUri)) {
660:
661: return true;
662:
663: }
664:
665: }
666:
667: return false;
668:
669: }
670:
671: private void processChildren(Element elem, List ns) {
672:
673: Namespace nsCourent = (Namespace) elem.getNamespace();
674:
675: String nsUri = (nsCourent.getURI());
676:
677: if (!exist(ns, nsUri)) {
678:
679: ns.add(nsUri.trim());
680:
681: nsc.addNamespace(nsCourent.getPrefix(), nsCourent.getURI());
682:
683: }
684:
685: List additionalNs = elem.getAdditionalNamespaces();
686:
687: if (!additionalNs.isEmpty())
688:
689: copyNsList(additionalNs, ns);
690:
691: if (elem.getChildren().size() > 0) {
692:
693: List elemChildren = elem.getChildren();
694:
695: for (int i = 0; i < elemChildren.size(); i++) {
696:
697: processChildren((Element) elemChildren.get(i), ns);
698:
699: }
700:
701: }
702:
703: }
704:
705: private void copyNsList(List nsElem, List nsRes) {
706:
707: for (int i = 0; i < nsElem.size(); i++) {
708:
709: Namespace ns = (Namespace) nsElem.get(i);
710:
711: nsc.addNamespace(ns.getPrefix(), ns.getURI());
712:
713: nsRes.add(ns.getURI().trim());
714:
715: }
716:
717: }
718:
719: /**
720: *
721: * Permet de récupérer les champs de Lius à partir du fichier de
722: * configuration
723: *
724: * pour effectuer l'indexation.
725: *
726: * <br/><br/>
727: *
728: * Get Lius fields from the configuration file for indexing.
729: *
730: */
731:
732: public Collection getLiusFields(LiusConfig lc) {
733:
734: return lc.getXmlFileFields();
735:
736: }
737:
738: }
|