001: /**********************************************************************************
002: *
003: * Copyright (c) 2003, 2004 The Regents of the University of Michigan, Trustees of Indiana University,
004: * Board of Trustees of the Leland Stanford, Jr., University, and The MIT Corporation
005: *
006: * Licensed under the Educational Community License Version 1.0 (the "License");
007: * By obtaining, using and/or copying this Original Work, you agree that you have read,
008: * understand, and will comply with the terms and conditions of the Educational Community License.
009: * You may obtain a copy of the License at:
010: *
011: * http://cvs.sakaiproject.org/licenses/license_1_0.html
012: *
013: * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
014: * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE
015: * AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
016: * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
017: * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
018: *
019: **********************************************************************************/package edu.indiana.lib.twinpeaks.search.singlesearch.web2;
020:
021: import java.util.Iterator;
022: import java.util.regex.Matcher;
023: import java.util.regex.Pattern;
024:
025: import org.w3c.dom.Document;
026: import org.w3c.dom.Element;
027: import org.w3c.dom.NodeList;
028:
029: import edu.indiana.lib.osid.base.repository.http.CreatorPartStructure;
030: import edu.indiana.lib.osid.base.repository.http.DOIPartStructure;
031: import edu.indiana.lib.osid.base.repository.http.DataSource;
032: import edu.indiana.lib.osid.base.repository.http.DatePartStructure;
033: import edu.indiana.lib.osid.base.repository.http.EditionPartStructure;
034: import edu.indiana.lib.osid.base.repository.http.EndPagePartStructure;
035: import edu.indiana.lib.osid.base.repository.http.InLineCitationPartStructure;
036: import edu.indiana.lib.osid.base.repository.http.IsnIdentifierPartStructure;
037: import edu.indiana.lib.osid.base.repository.http.IssuePartStructure;
038: import edu.indiana.lib.osid.base.repository.http.LanguagePartStructure;
039: import edu.indiana.lib.osid.base.repository.http.PagesPartStructure;
040: import edu.indiana.lib.osid.base.repository.http.PublisherPartStructure;
041: import edu.indiana.lib.osid.base.repository.http.SourceTitlePartStructure;
042: import edu.indiana.lib.osid.base.repository.http.StartPagePartStructure;
043: import edu.indiana.lib.osid.base.repository.http.SubjectPartStructure;
044: import edu.indiana.lib.osid.base.repository.http.TypePartStructure;
045: import edu.indiana.lib.osid.base.repository.http.URLPartStructure;
046: import edu.indiana.lib.osid.base.repository.http.VolumePartStructure;
047: import edu.indiana.lib.osid.base.repository.http.YearPartStructure;
048: import edu.indiana.lib.twinpeaks.search.MatchItem;
049: import edu.indiana.lib.twinpeaks.search.QueryBase;
050: import edu.indiana.lib.twinpeaks.search.SearchResultBase;
051: import edu.indiana.lib.twinpeaks.util.DomUtils;
052: import edu.indiana.lib.twinpeaks.util.LogUtils;
053: import edu.indiana.lib.twinpeaks.util.SearchException;
054: import edu.indiana.lib.twinpeaks.util.SessionContext;
055: import edu.indiana.lib.twinpeaks.util.StatusUtils;
056: import edu.indiana.lib.twinpeaks.util.StringUtils;
057:
058: /**
059: * Parse the Web2 XML response
060: */
061: public class Web2Response extends SearchResultBase {
062: private static org.apache.commons.logging.Log _log = LogUtils
063: .getLog(Web2Response.class);
064:
065: private SessionContext sessionContext;
066:
/**
 * Creates a Web2 response parser. No state is set up here beyond what the
 * <code>SearchResultBase</code> constructor establishes; see
 * <code>initialize()</code> for the per-request setup.
 */
public Web2Response() {
    super();
}
073:
074: /**
075: * Save various attributes of the general search request
076: *
077: * @param query
078: * The QueryBase extension that sent the search request
079: */
080: public void initialize(QueryBase query) {
081: super .initialize(query);
082:
083: sessionContext = SessionContext.getInstance(_sessionId);
084: }
085:
086: /**
087: * Parse the search engine response as XML Overrides
088: * <code>SearchResultBase#parseResponse()</code>
089: *
090: * @return Response as a DOM Document
091: */
092: protected Document parseResponse() throws SearchException {
093: try {
094: return DomUtils.parseXmlBytes(_searchResponseBytes);
095: } catch (Exception exception) {
096: throw new SearchException(exception.toString());
097: }
098: }
099:
100: /**
101: * Parse the response
102: */
103: public void doParse() {
104: Document responseDocument = getSearchResponseDocument();
105: Element resultElement;
106: NodeList recordList;
107:
108: /*
109: * Examine each RECORD
110: */
111: resultElement = DomUtils.getElement(responseDocument
112: .getDocumentElement(), "RESULTS");
113: recordList = DomUtils.getElementList(resultElement, "RECORD");
114:
115: for (int i = 0; i < recordList.getLength(); i++) {
116: MatchItem item;
117: Element dataElement, recordElement;
118: NodeList nodeList;
119: String title, description;
120: String database, hit, target;
121: String recordId, recordType;
122: String content;
123:
124: /*
125: * Skip status RECORD elements
126: */
127: recordElement = (Element) recordList.item(i);// gets the record
128: // number(1-10)
129: recordType = recordElement.getAttribute("type");
130:
131: if (!StringUtils.isNull(recordType)) {
132: /*
133: * Error?
134: */
135: if (recordType.equalsIgnoreCase("error")) {
136: Element element;
137: String status, text;
138:
139: status = recordElement.getAttribute("status");
140: element = DomUtils
141: .getElement(recordElement, "DATA");
142: text = DomUtils.getText(element);
143:
144: if (StringUtils.isNull(status)) {
145: status = "<unknown>";
146: }
147:
148: if (text == null) {
149: text = "";
150: }
151:
152: StatusUtils.setGlobalError(sessionContext, status,
153: text);
154:
155: _log.error("Error RECORD found");
156: displayXml(recordElement);
157:
158: throw new SearchException(status);
159: }
160: /*
161: * Not an error, just note it and ignore
162: */
163: _log.debug("Skipping RECORD with non-null TYPE \""
164: + recordType + "\"");
165: continue;
166: }
167: /*
168: * Pick up the database name & related information
169: */
170: hit = recordElement.getAttribute("hit");
171: target = recordElement.getAttribute("sourceID");
172: database = recordElement.getAttribute("source");
173: recordId = recordElement.getAttribute("identifier");
174: /*
175: * Update hit count
176: */
177: StatusUtils.updateHits(sessionContext, target);
178: /*
179: * The information we want resides in the DATA portion of the
180: * document
181: */
182: if ((dataElement = DomUtils.getElement(recordElement,
183: "DATA")) == null) {
184: _log
185: .error("No DATA element present in server response");
186: displayXml(recordElement);
187: throw new SearchException(
188: "Missing mandatory <DATA> element in server response");
189: }
190:
191: title = getText(dataElement, "TITLE");
192: if (StringUtils.isNull(title)) {
193: _log.warn("No TITLE text in server response");
194: displayXml(recordElement);
195: title = "";
196: }
197:
198: description = getText(dataElement, "DESCRIPTION");
199: if (StringUtils.isNull(description)) {
200: _log.warn("No DESCRIPTION text in server response");
201: description = "";
202: }
203: /*
204: * Save select search result data
205: */
206: item = new MatchItem();
207: /*
208: * Title, abstract, record ID
209: */
210: _log.debug("Adding TITLE: " + title);
211:
212: item.setDisplayName(title);
213: item.setDescription(description);
214: item.setId(recordId);
215: /*
216: * Publisher, language
217: */
218: addPartStructure(dataElement, "PUBLICATION", item,
219: PublisherPartStructure.getPartStructureId());
220:
221: addPartStructure(dataElement, "LANGUAGE", item,
222: LanguagePartStructure.getPartStructureId());
223: /*
224: * In-line Citation information
225: */
226:
227: if (!addPartStructure(dataElement, "CITATION", item,
228: InLineCitationPartStructure.getPartStructureId())) {
229:
230: if (!addPartStructure(dataElement, "SOURCE", item,
231: InLineCitationPartStructure
232: .getPartStructureId())) {
233:
234: if (!addPartStructure(dataElement, "DESCRIPTION",
235: item, InLineCitationPartStructure
236: .getPartStructureId())) {
237:
238: addPartStructure(dataElement, "TITLE", item,
239: InLineCitationPartStructure
240: .getPartStructureId());
241: }
242:
243: }
244:
245: }
246:
247: /*
248: * Title, volume, issue
249: */
250: if (!addPartStructure(dataElement,
251: "CITATION-JOURNAL-TITLE", item,
252: SourceTitlePartStructure.getPartStructureId())) {
253: addPartStructure(dataElement, "SOURCE", item,
254: SourceTitlePartStructure.getPartStructureId());
255: }
256:
257: addPartStructure(dataElement, "CITATION-VOLUME", item,
258: VolumePartStructure.getPartStructureId());
259:
260: addPartStructure(dataElement, "CITATION-ISSUE", item,
261: IssuePartStructure.getPartStructureId());
262:
263: addPartStructure(dataElement, "CITATION-PART", item,
264: EditionPartStructure.getPartStructureId());
265: /*
266: * Pages
267: */
268: addPartStructure(dataElement, "CITATION-PAGES", item,
269: PagesPartStructure.getPartStructureId());
270:
271: addPartStructure(dataElement, "CITATION-START-PAGE", item,
272: StartPagePartStructure.getPartStructureId());
273:
274: addPartStructure(dataElement, "CITATION-END-PAGE", item,
275: EndPagePartStructure.getPartStructureId());
276: /*
277: * Date and Year
278: */
279: addPartStructure(dataElement, "CITATION-DATE", item,
280: DatePartStructure.getPartStructureId());
281:
282: if (!addPartStructure(dataElement, "CITATION-DATE-YEAR",
283: item, YearPartStructure.getPartStructureId())) {
284: addPartStructure(dataElement, "CITATION-DATE", item,
285: YearPartStructure.getPartStructureId());
286: }
287: /*
288: * Type of publication
289: */
290: if (!addPartStructure(dataElement, "TYPE", item,
291: TypePartStructure.getPartStructureId())) {
292: if (!addPartStructure(dataElement, "PUBLICATION-TYPE",
293: item, TypePartStructure.getPartStructureId())) {
294: if (getText(dataElement, "CITATION-JOURNAL-TITLE") != null) {
295: item.addPartStructure(TypePartStructure
296: .getPartStructureId(), "Journal");
297: }
298: }
299: }
300: /*
301: * URL
302: */
303: addPartStructure(dataElement, "URL", item, URLPartStructure
304: .getPartStructureId());
305: /*
306: * Identifiers (ISSN, ISBN, DOI)
307: */
308: addPartStructure(dataElement, "ISBN", item,
309: IsnIdentifierPartStructure.getPartStructureId());
310:
311: addPartStructure(dataElement, "ISSN", item,
312: IsnIdentifierPartStructure.getPartStructureId());
313:
314: if (!addPartStructure(dataElement, "CITATION-DOI", item,
315: DOIPartStructure.getPartStructureId())) {
316: addPartStructure(dataElement, "DOI", item,
317: DOIPartStructure.getPartStructureId());
318: }
319: /*
320: * Author (add each in turn)
321: */
322: addPartStructureList(dataElement, "AUTHOR", item,
323: CreatorPartStructure.getPartStructureId());
324: /*
325: * Subject (add each)
326: */
327: addPartStructureList(dataElement, "SUBJECT", item,
328: SubjectPartStructure.getPartStructureId());
329:
330: doRegexParse(database, item);
331:
332: /*
333: * Save the asset component we just created
334: */
335:
336: addItem(item);
337:
338: }
339: }
340:
341: /**
342: * This method does its best to map data contained in an inLineCitation to
343: * other fields such as volume, issue, etc. in the case that they are empty.
344: * It compares the citation to a known set of regular expressions contained
345: * in REGULAR_EXPRESSION. Adding a new regular expression entails adding a
346: * new case for parsing in this method.
347: *
348: * @param citation
349: * inLineCitation to be parsed
350: */
351:
352: private void doRegexParse(String database, MatchItem item) {
353: Pattern pattern;
354: Matcher matcher;
355:
356: boolean hasVolume = false;
357: boolean hasIssue = false;
358: boolean hasDate = false;
359: boolean hasYear = false;
360: boolean hasStartPage = false;
361: boolean hasEndPage = false;
362: boolean hasSourceTitle = false;
363:
364: try {
365: String citation;
366: DataSource dataSource;
367: boolean regExpFound;
368:
369: citation = (String) ((MatchItem.PartPair) getPartPair(
370: InLineCitationPartStructure.getPartStructureId(),
371: item)).getValue();
372: dataSource = new DataSource(database, citation);
373:
374: if (!dataSource.findRegExp()) {
375: return;
376: }
377:
378: hasVolume = recordHasPart(VolumePartStructure
379: .getPartStructureId(), item);
380:
381: hasIssue = recordHasPart(IssuePartStructure
382: .getPartStructureId(), item);
383:
384: hasDate = recordHasPart(DatePartStructure
385: .getPartStructureId(), item);
386:
387: hasYear = recordHasPart(YearPartStructure
388: .getPartStructureId(), item);
389:
390: hasStartPage = recordHasPart(StartPagePartStructure
391: .getPartStructureId(), item);
392:
393: hasEndPage = recordHasPart(EndPagePartStructure
394: .getPartStructureId(), item);
395:
396: hasSourceTitle = recordHasPart(SourceTitlePartStructure
397: .getPartStructureId(), item);
398:
399: if (!hasVolume) {
400: pattern = Pattern.compile(dataSource.getVolumeToken());
401: matcher = pattern.matcher(citation);
402: if (matcher.find()) {
403: addPartStructure(item, VolumePartStructure
404: .getInstance().getId(), matcher.group());
405: }
406: }
407:
408: if (!hasIssue) {
409: pattern = Pattern.compile(dataSource.getIssueToken());
410: matcher = pattern.matcher(citation);
411: if (matcher.find()) {
412: addPartStructure(item, IssuePartStructure
413: .getInstance().getId(), matcher.group()
414: .replaceAll("\\D", ""));
415: }
416: }
417:
418: if (!hasDate) {
419: pattern = Pattern.compile(dataSource.getDateToken());
420: matcher = pattern.matcher(citation);
421:
422: if (matcher.find()) {
423: String date = matcher.group().substring(
424: dataSource.getReplaceStartToken(),
425: matcher.group().length()
426: - dataSource.getReplaceEndToken());
427: addPartStructure(item, DatePartStructure
428: .getInstance().getId(), date);
429: }
430: }
431:
432: if (!hasYear) {
433: pattern = Pattern.compile(dataSource.getYearToken());
434: matcher = pattern.matcher(citation);
435:
436: if (matcher.find()) {
437: String year = matcher.group().substring(
438: dataSource.getReplaceStartToken(),
439: matcher.group().length()
440: - dataSource.getReplaceEndToken());
441: addPartStructure(item, YearPartStructure
442: .getInstance().getId(), year);
443: }
444: }
445:
446: if (!hasStartPage || !hasEndPage) {
447: pattern = Pattern.compile(dataSource.getPagesToken());
448: matcher = pattern.matcher(citation);
449: if (matcher.find()) {
450: createPagesPart(matcher.group(), item);
451: }
452: }
453:
454: if (!hasSourceTitle) {
455: pattern = Pattern.compile(dataSource
456: .getSourceTitleToken());
457: matcher = pattern.matcher(citation);
458: if (matcher.find()) {
459: String sourceTitle = matcher.group().substring(0,
460: matcher.group().length() - 1);
461: addPartStructure(item, SourceTitlePartStructure
462: .getInstance().getId(), sourceTitle);
463: }
464: }
465:
466: } catch (org.osid.repository.RepositoryException e) {
467: _log.warn("doRegexParse() failed", e);
468: }
469: }
470:
471: private void createPagesPart(String text, MatchItem item)
472: throws org.osid.repository.RepositoryException {
473: try {
474: if (text == null || text.equals(""))
475: return;
476: else if (text.charAt(0) == ',') {
477: // getting a poorly formatted field
478: return;
479: }
480:
481: addPartStructure(item, PagesPartStructure.getInstance()
482: .getId(), text);
483:
484: // get start and end page if possible
485: String[] pages = text.split("-");
486:
487: if (pages.length == 0) {
488: // cannot create start/end page.
489: return;
490: }
491:
492: String spage = pages[0].trim();
493:
494: // delete all non-digit chars (ie: p., pp., etc.)
495: spage = spage.replaceAll("\\D", "");
496: _log
497: .debug("======================&&&& Start page: spage &&&================");
498:
499: // create startPage part
500: addPartStructure(item, StartPagePartStructure.getInstance()
501: .getId(), spage);
502:
503: // end page
504: if (pages.length == 2) {
505: String epage = pages[1].trim();
506: epage = epage.replaceAll("\\D", "");
507: addPartStructure(item, EndPagePartStructure
508: .getInstance().getId(), epage);
509: }
510: } catch (StringIndexOutOfBoundsException e) {
511: _log.warn("createPagesPart()", e);
512: }
513: }
514:
515: /**
516: * This method searches the current record for a Part using its
517: * PartStructure Type.
518: *
519: * @param partStructureId
520: * PartStructure Type of Part you need.
521: * @return the Part if it exists in the current record, null if it does not.
522: */
523: private boolean recordHasPart(org.osid.shared.Id partStructureId,
524: MatchItem item) {
525:
526: if (this .getPartPair(partStructureId, item) == null) {
527: return false;
528: } else {
529: return true;
530: }
531:
532: }
533:
534: private MatchItem.PartPair getPartPair(
535: org.osid.shared.Id partStructureId, MatchItem item) {
536: Iterator partPairIterator = item.partPairIterator();
537: while (partPairIterator.hasNext()) {
538: MatchItem.PartPair partPair = (MatchItem.PartPair) partPairIterator
539: .next();
540: if (partPair.getId().equals(partStructureId)) {
541:
542: return partPair;
543: }
544:
545: }
546:
547: return null;
548: }
549:
550: /*
551: * Helpers
552: */
553:
554: /**
555: * Locate (and save as PartStructure id/value pairs) all matching items
556: *
557: * @param rootElement
558: * Start looking here
559: * @param partDataName
560: * Name of the XML element we're looking for
561: * @param item
562: * Current MatchItem (eg Asset)
563: * @param id
564: * Part ID
565: * @return true if PartStructure data was added, false if none found
566: */
567: private boolean addPartStructureList(Element parentElement,
568: String partDataName, MatchItem item, org.osid.shared.Id id) {
569: NodeList nodeList = DomUtils.getElementList(parentElement,
570: partDataName);
571: boolean partsAdded = false;
572:
573: for (int i = 0; i < nodeList.getLength(); i++) {
574: Element element = (Element) nodeList.item(i);
575: String text = DomUtils.getText(element);
576:
577: if (!StringUtils.isNull(text)) {
578: addPartStructure(item, id, text);
579: partsAdded = true;
580: }
581: }
582: return partsAdded;
583: }
584:
585: /**
586: * Save (add new) PartStructure data
587: *
588: * @param item
589: * Current MatchItem (eg Asset)
590: * @param id
591: * Part ID
592: * @param value
593: * Part value
594: * @return true If Part data was added, false if no data was found
595: */
596: private boolean addPartStructure(MatchItem item,
597: org.osid.shared.Id id, String value) {
598: boolean partAdded = false;
599: String text = value;
600:
601: if (text != null) {
602: text = text.trim();
603: }
604:
605: if (!StringUtils.isNull(text)) {
606: item.addPartStructure(id, text);
607: partAdded = true;
608: }
609: return partAdded;
610: }
611:
612: /**
613: * Locate (in response XML) and save PartStructure data
614: *
615: * @param parentElement
616: * Parent Element - the search starts here
617: * @param partDataName
618: * The name of the child element where Part data is found
619: * @param item
620: * Current MatchItem (eg Asset)
621: * @param id
622: * Part ID
623: * @return true If Part data was added, false if no data was found
624: */
625: private boolean addPartStructure(Element parentElement,
626: String partDataName, MatchItem item, org.osid.shared.Id id) {
627: String value = getText(parentElement, partDataName);
628:
629: return addPartStructure(item, id, value);
630: }
631:
632: /**
633: * Locate text
634: *
635: * @param parent
636: * Search from here
637: * @param name
638: * Find this element
639: * @return Text (null if none)
640: */
641: private String getText(Element parent, String name) {
642: Element element = DomUtils.getElement(parent, name);
643: String text = null;
644:
645: if (element != null) {
646: text = DomUtils.getText(element);
647: }
648: return text;
649: }
650:
651: /**
652: * Display XML (with optional warning header)
653: *
654: * @param errorText
655: * Error message (null for none)
656: * @param recordElement
657: * The XML object to disolay (Document, Element)
658: */
659: private static void displayXml(String errorText, Object xmlObject) {
660:
661: try {
662: LogUtils.displayXml(_log, errorText, xmlObject);
663: } catch (Exception ignore) {
664: }
665: }
666:
667: /**
668: * Display XML information
669: *
670: * @param xmlObject
671: * XML to display (Document, Element)
672: */
673: private void displayXml(Object xmlObject) {
674:
675: try {
676: LogUtils.displayXml(_log, xmlObject);
677: } catch (Exception ignore) {
678: }
679: }
680: }
|