001: package org.sakaibrary.osid.repository.xserver;
002:
003: import java.io.IOException;
004: import java.util.regex.Matcher;
005: import java.util.regex.Pattern;
006:
007: import javax.xml.parsers.ParserConfigurationException;
008:
009: import org.sakaibrary.xserver.session.MetasearchSession;
010: import org.sakaibrary.xserver.session.MetasearchSessionManager;
011: import org.xml.sax.SAXException;
012: import org.xml.sax.SAXParseException;
013:
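/**
 * Iterator over the Assets produced by an X-Server metasearch session.
 * Record XML fetched from the X-Server is parsed with SAX (this class is
 * its own handler) and the resulting Assets are queued until requested.
 *
 * @author gbhatnag
 */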
018: public class AssetIterator extends org.xml.sax.helpers.DefaultHandler
019: implements org.osid.repository.AssetIterator {
020:
021: private static final long serialVersionUID = 1L;
022: private static final String REGULAR_EXPRESSION_FILE = "/data/citationRegex.txt";
023: private static final org.apache.commons.logging.Log LOG = org.apache.commons.logging.LogFactory
024: .getLog("org.sakaibrary.osid.repository.xserver.AssetIterator");
025:
026: private java.util.LinkedList assetQueue;
027: private java.util.ArrayList regexArray;
028: private String guid;
029: private int totalRecordsCursor = 0;
030: private int numRecordsReturned = 0;
031: private org.osid.shared.Id repositoryId;
032: private org.osid.shared.Id recordStructureId;
033: private org.osid.repository.Asset asset;
034: private org.osid.repository.Record record;
035:
036: // for SAX parsing
037: private StringBuffer textBuffer;
038:
039: // session
040: private MetasearchSessionManager msm;
041: org.osid.shared.Properties statusProperties;
042:
/**
 * Constructs an empty AssetIterator for the given metasearch session.
 *
 * The citation regular expressions are loaded from
 * REGULAR_EXPRESSION_FILE. If they cannot be read the failure is logged
 * and the iterator continues with an empty expression list.
 *
 * @param guid globally unique identifier for this session
 * @throws org.osid.repository.RepositoryException declared by the
 *         signature; nothing in this constructor throws it directly
 */
protected AssetIterator(String guid)
        throws org.osid.repository.RepositoryException {
    this.guid = guid;

    // get session cache manager
    msm = MetasearchSessionManager.getInstance();

    // queue of parsed Assets waiting to be returned by nextAsset()
    assetQueue = new java.util.LinkedList();

    // Start from an empty list so that a failed load leaves citation
    // parsing with nothing to match, rather than a null list that
    // doRegexParse() would dereference.
    regexArray = new java.util.ArrayList();
    try {
        regexArray = loadCitationRegularExpressions(REGULAR_EXPRESSION_FILE);
    } catch (java.io.IOException ioe) {
        LOG.warn("AssetIterator() failed reading citation regular "
                + "expressions - regex file: "
                + REGULAR_EXPRESSION_FILE, ioe);
    }
}
068:
069: private java.util.ArrayList loadCitationRegularExpressions(
070: String filename) throws java.io.IOException {
071: java.util.ArrayList regexArray = new java.util.ArrayList();
072:
073: java.io.InputStream is = this .getClass().getResourceAsStream(
074: filename);
075: java.io.BufferedReader regexes = new java.io.BufferedReader(
076: new java.io.InputStreamReader(is));
077:
078: // read the regex file and add regexes to array
079: String regex;
080: while ((regex = regexes.readLine()) != null) {
081: String[] nameRegex = regex.split("=");
082:
083: CitationRegex citationRegex = new CitationRegex();
084: citationRegex.setName(nameRegex[0].trim());
085: citationRegex.setRegex(nameRegex[1].trim());
086:
087: regexArray.add(citationRegex);
088: }
089: regexes.close();
090: is.close();
091:
092: return regexArray;
093: }
094:
095: public boolean hasNextAsset()
096: throws org.osid.repository.RepositoryException {
097: MetasearchSession metasearchSession = msm
098: .getMetasearchSession(guid);
099:
100: // get an XServer to check status and update number of records found
101: org.sakaibrary.xserver.XServer xserver = null;
102: statusProperties = null;
103: try {
104: xserver = new org.sakaibrary.xserver.XServer(guid);
105: xserver.updateSearchStatusProperties();
106: statusProperties = xserver.getSearchStatusProperties();
107: } catch (org.sakaibrary.xserver.XServerException xse) {
108: LOG.warn("X-Server error: " + xse.getErrorCode() + " - "
109: + xse.getErrorText());
110:
111: // throw exception now that status has been updated
112: throw new org.osid.repository.RepositoryException(
113: org.sakaibrary.osid.repository.xserver.MetasearchException.METASEARCH_ERROR);
114: }
115:
116: // check status for error/timeout
117: String status = null;
118:
119: try {
120: status = (String) statusProperties.getProperty("status");
121: } catch (org.osid.shared.SharedException se) {
122: LOG.warn("hasNextAsset() failed getting status "
123: + "property", se);
124: }
125:
126: if (status != null) {
127: // status and statusMessage are set by XServer.updateSearchStatusProperties
128: if (status.equals("error")) {
129: throw new org.osid.repository.RepositoryException(
130: org.sakaibrary.osid.repository.xserver.MetasearchException.METASEARCH_ERROR);
131: } else if (status.equals("timeout")) {
132: throw new org.osid.repository.RepositoryException(
133: org.sakaibrary.osid.repository.xserver.MetasearchException.SESSION_TIMED_OUT);
134: } else if (status.equals("empty")) {
135: // no records found
136: return false;
137: }
138: } else {
139: LOG.warn("hasNextAsset() - status property is null");
140: }
141:
142: // get updated metasearchSession
143: metasearchSession = msm.getMetasearchSession(guid);
144: Integer numRecordsFound = metasearchSession
145: .getNumRecordsFound();
146:
147: if (numRecordsFound == null || numRecordsFound.intValue() == 0) {
148: // still searching for records, return true
149: return true;
150: }
151:
152: // check if passed max number of attainable records
153: int maxAttainable;
154: boolean gotMergeError = metasearchSession.isGotMergeError();
155: if (gotMergeError) {
156: maxAttainable = 300;
157: } else {
158: maxAttainable = numRecordsFound.intValue();
159: }
160:
161: return (numRecordsReturned < maxAttainable);
162: }
163:
164: public org.osid.repository.Asset nextAsset()
165: throws org.osid.repository.RepositoryException {
166: LOG.debug("nextAsset() [entry] - returned: "
167: + numRecordsReturned + "; total: " + totalRecordsCursor
168: + "; in queue: " + assetQueue.size());
169:
170: // return Asset, if ready
171: if (assetQueue.size() > 0) {
172: numRecordsReturned++;
173: return (org.osid.repository.Asset) assetQueue.removeFirst();
174: }
175:
176: // assetQueue is empty - check whether we should get more records
177: // or throw an Exception
178: if (hasNextAsset()) {
179: // hasNextAsset() will throw timeout/error Exceptions if any
180: String status = null;
181:
182: try {
183: status = (String) statusProperties
184: .getProperty("status");
185: } catch (org.osid.shared.SharedException se) {
186: LOG.warn("nextAsset() failed getting status property",
187: se);
188: }
189:
190: if (!status.equals("ready")) {
191: // the X-Server is still searching/fetching - try again later
192: throw new org.osid.repository.RepositoryException(
193: org.sakaibrary.osid.repository.xserver.MetasearchException.ASSET_NOT_FETCHED);
194: }
195:
196: // get records from the X-Server
197: MetasearchSession metasearchSession = msm
198: .getMetasearchSession(guid);
199: org.osid.shared.Id repositoryId = metasearchSession
200: .getRepositoryId();
201:
202: try {
203: org.sakaibrary.xserver.XServer xserver = new org.sakaibrary.xserver.XServer(
204: guid);
205:
206: LOG
207: .debug("nextAsset() calling XServer.getRecordsXML() - assets in "
208: + "queue: " + assetQueue.size());
209: createAssets(xserver.getRecordsXML(totalRecordsCursor),
210: repositoryId);
211: } catch (org.sakaibrary.xserver.XServerException xse) {
212: LOG.warn("X-Server error: " + xse.getErrorCode()
213: + " - " + xse.getErrorText());
214:
215: throw new org.osid.repository.RepositoryException(
216: org.sakaibrary.osid.repository.xserver.MetasearchException.METASEARCH_ERROR);
217: }
218: LOG
219: .debug("nextAsset(), XServer.getRecordsXML() returns - assets in "
220: + "queue: " + assetQueue.size());
221:
222: // records have been fetched and Assets queued
223: totalRecordsCursor += assetQueue.size();
224: numRecordsReturned++;
225: return (org.osid.repository.Asset) assetQueue.removeFirst();
226: } else {
227: // no assets available
228: throw new org.osid.repository.RepositoryException(
229: org.osid.shared.SharedException.NO_MORE_ITERATOR_ELEMENTS);
230: }
231: }
232:
233: /**
234: * This method parses the xml StringBuffer and creates Assets, Records
235: * and Parts in the Repository with the given repositoryId.
236: *
237: * @param xml input xml in "sakaibrary" format
238: * @param log the log being used by the Repository
239: * @param repositoryId the Id of the Repository in which to create Assets,
240: * Records and Parts.
241: *
242: * @throws org.osid.repository.RepositoryException
243: */
244: private void createAssets(java.io.ByteArrayInputStream xml,
245: org.osid.shared.Id repositoryId)
246: throws org.osid.repository.RepositoryException {
247: this .repositoryId = repositoryId;
248: recordStructureId = RecordStructure.getInstance().getId();
249: textBuffer = new StringBuffer();
250:
251: // use a SAX parser
252: javax.xml.parsers.SAXParserFactory factory;
253: javax.xml.parsers.SAXParser saxParser;
254:
255: // set up the parser
256: factory = javax.xml.parsers.SAXParserFactory.newInstance();
257: factory.setNamespaceAware(true);
258:
259: // start parsing
260: try {
261: saxParser = factory.newSAXParser();
262: saxParser.parse(xml, this );
263: xml.close();
264: } catch (SAXParseException spe) {
265: // Use the contained exception, if any
266: Exception x = spe;
267:
268: if (spe.getException() != null) {
269: x = spe.getException();
270: }
271:
272: // Error generated by the parser
273: LOG.warn("createAssets() parsing exception: "
274: + spe.getMessage() + " - xml line "
275: + spe.getLineNumber() + ", uri "
276: + spe.getSystemId(), x);
277: } catch (SAXException sxe) {
278: // Error generated by this application
279: // (or a parser-initialization error)
280: Exception x = sxe;
281:
282: if (sxe.getException() != null) {
283: x = sxe.getException();
284: }
285:
286: LOG.warn("createAssets() SAX exception: "
287: + sxe.getMessage(), x);
288: } catch (ParserConfigurationException pce) {
289: // Parser with specified options can't be built
290: LOG.warn("createAssets() SAX parser cannot be built with "
291: + "specified options");
292: } catch (IOException ioe) {
293: // I/O error
294: LOG.warn("createAssets() IO exception", ioe);
295: }
296: }
297:
298: //----------------------------------
299: // SAX DEFAULT HANDLER IMPLEMENTATIONS -
300: //----------------------------------
301:
302: /**
303: * Receive notification of the beginning of an element.
304: *
305: * @see DefaultHandler
306: */
307: public void startElement(String namespaceURI, String sName,
308: String qName, org.xml.sax.Attributes attrs)
309: throws org.xml.sax.SAXException {
310: if (qName.equals("record")) {
311: populateAssetFromText("record_start");
312: }
313: }
314:
315: /**
316: * Receive notification of the end of an element.
317: *
318: * @see DefaultHandler
319: */
320: public void endElement(String namespaceURI, String sName,
321: String qName) throws org.xml.sax.SAXException {
322: populateAssetFromText(qName);
323: }
324:
325: /**
326: * Receive notification of character data inside an element.
327: *
328: * @see DefaultHandler
329: */
330: public void characters(char[] buf, int offset, int len)
331: throws org.xml.sax.SAXException {
332: // store character data
333: String text = new String(buf, offset, len);
334:
335: if (textBuffer == null) {
336: textBuffer = new StringBuffer(text);
337: } else {
338: textBuffer.append(text);
339: }
340: }
341:
342: private void populateAssetFromText(String elementName) {
343: // new record
344: if (elementName.equals("record_start")) {
345: try {
346: // create a new asset... need title, description, assetId
347: asset = new Asset(null, null, getId(), repositoryId);
348:
349: // create a new record
350: record = asset.createRecord(recordStructureId);
351: } catch (org.osid.repository.RepositoryException re) {
352: LOG.warn("populateAssetFromText() failed to "
353: + "create new Asset/Record pair.", re);
354: }
355: } else if (elementName.equals("record")) {
356: // a record has ended: do post-processing //
357:
358: // set dateRetrieved
359: setDateRetrieved();
360:
361: // use inLineCitation to fill in other fields, if possible
362: org.osid.repository.Part inLineCitation;
363: try {
364: if ((inLineCitation = recordHasPart(InLineCitationPartStructure
365: .getInstance().getType())) != null) {
366: doRegexParse((String) inLineCitation.getValue());
367: }
368: } catch (org.osid.repository.RepositoryException re) {
369: LOG.warn("populateAssetFromText() failed to "
370: + "gracefully process inLineCitation value.",
371: re);
372: }
373:
374: assetQueue.add(asset);
375: }
376:
377: if (textBuffer == null) {
378: return;
379: }
380:
381: String text = textBuffer.toString().trim();
382: if (text.equals("")) {
383: return;
384: }
385:
386: try {
387: if (elementName.equals("title")) {
388: asset.updateDisplayName(text);
389: } else if (elementName.equals("abstract")) {
390: asset.updateDescription(text);
391: } else if (elementName.equals("author")) {
392: record.createPart(CreatorPartStructure.getInstance()
393: .getId(), text);
394: } else if (elementName.equals("date")) {
395: record.createPart(DatePartStructure.getInstance()
396: .getId(), text);
397: } else if (elementName.equals("doi")) {
398: record.createPart(DOIPartStructure.getInstance()
399: .getId(), text);
400: } else if (elementName.equals("edition")) {
401: record.createPart(EditionPartStructure.getInstance()
402: .getId(), text);
403: } else if (elementName.equals("inLineCitation")) {
404: record.createPart(InLineCitationPartStructure
405: .getInstance().getId(), text);
406: } else if (elementName.equals("isnIdentifier")) {
407: record.createPart(IsnIdentifierPartStructure
408: .getInstance().getId(), text);
409: } else if (elementName.equals("issue")) {
410: record.createPart(IssuePartStructure.getInstance()
411: .getId(), text);
412: } else if (elementName.equals("language")) {
413: record.createPart(LanguagePartStructure.getInstance()
414: .getId(), text);
415: } else if (elementName.equals("note")) {
416: record.createPart(NotePartStructure.getInstance()
417: .getId(), text);
418: } else if (elementName.equals("openUrl")) {
419: record.createPart(OpenUrlPartStructure.getInstance()
420: .getId(), text);
421: } else if (elementName.equals("pages")) {
422: createPagesPart(text);
423: } else if (elementName.equals("publisherInfo")) {
424: record.createPart(PublisherPartStructure.getInstance()
425: .getId(), text);
426: } else if (elementName.equals("rights")) {
427: record.createPart(RightsPartStructure.getInstance()
428: .getId(), text);
429: } else if (elementName.equals("sourceTitle")) {
430: record.createPart(SourceTitlePartStructure
431: .getInstance().getId(), text);
432: } else if (elementName.equals("subject")) {
433: record.createPart(SubjectPartStructure.getInstance()
434: .getId(), text);
435: } else if (elementName.equals("type")) {
436: record.createPart(TypePartStructure.getInstance()
437: .getId(), text);
438: } else if (elementName.equals("url")) {
439: record.createPart(URLPartStructure.getInstance()
440: .getId(), text);
441: } else if (elementName.equals("urlLabel")) {
442: record.createPart(URLLabelPartStructure.getInstance()
443: .getId(), text);
444: } else if (elementName.equals("urlFormat")) {
445: record.createPart(URLFormatPartStructure.getInstance()
446: .getId(), text);
447: } else if (elementName.equals("volume")) {
448: record.createPart(VolumePartStructure.getInstance()
449: .getId(), text);
450: } else if (elementName.equals("volumeIssue")) {
451: doRegexParse(text);
452: } else if (elementName.equals("year")) {
453: record.createPart(YearPartStructure.getInstance()
454: .getId(), text);
455: }
456: } catch (org.osid.repository.RepositoryException re) {
457: LOG.warn("populateAssetFromText() failed to "
458: + "create new Part.", re);
459: }
460:
461: textBuffer = null;
462: }
463:
464: private void setDateRetrieved() {
465: java.util.GregorianCalendar now = new java.util.GregorianCalendar();
466: int month = now.get(java.util.Calendar.MONTH) + 1;
467: int date = now.get(java.util.Calendar.DATE);
468: String monthStr, dateStr;
469:
470: if (month < 10) {
471: monthStr = "0" + month;
472: } else {
473: monthStr = String.valueOf(month);
474: }
475:
476: if (date < 10) {
477: dateStr = "0" + date;
478: } else {
479: dateStr = String.valueOf(date);
480: }
481: String dateRetrieved = now.get(java.util.Calendar.YEAR) + "-"
482: + monthStr + "-" + dateStr;
483:
484: try {
485: record.createPart(DateRetrievedPartStructure.getInstance()
486: .getId(), dateRetrieved);
487: } catch (org.osid.repository.RepositoryException re) {
488: LOG.warn("setDateRetrieved() failed "
489: + "creating new dateRetrieved Part.", re);
490: }
491: }
492:
493: /**
494: * This method searches the current record for a Part using its
495: * PartStructure Type.
496: *
497: * @param partStructureType PartStructure Type of Part you need.
498: * @return the Part if it exists in the current record, null if it does not.
499: */
500: private org.osid.repository.Part recordHasPart(
501: org.osid.shared.Type partStructureType) {
502: try {
503: org.osid.repository.PartIterator pit = record.getParts();
504:
505: while (pit.hasNextPart()) {
506: org.osid.repository.Part part = pit.nextPart();
507:
508: if (part.getPartStructure().getType().isEqual(
509: partStructureType)) {
510: return part;
511: }
512: }
513: } catch (org.osid.repository.RepositoryException re) {
514: LOG.warn("recordHasPart() failed getting Parts.", re);
515: }
516:
517: // did not find the Part
518: return null;
519: }
520:
521: /**
522: * This method does its best to map data contained in an inLineCitation to
523: * other fields such as volume, issue, etc. in the case that they are empty.
524: * It compares the citation to a known set of regular expressions contained
525: * in REGULAR_EXPRESSION_FILE. Adding a new regular expression entails
526: * adding a new case for parsing in this method.
527: *
528: * @param citation inLineCitation to be parsed
529: */
530: private void doRegexParse(String citation) {
531: String regexName = null;
532: Pattern pattern;
533: Matcher matcher;
534: boolean hasVolume = false;
535: boolean hasIssue = false;
536: boolean hasDate = false;
537: boolean hasPages = false;
538: boolean hasSourceTitle = false;
539:
540: for (int i = 0; i < regexArray.size(); i++) {
541: CitationRegex citationRegex = (CitationRegex) regexArray
542: .get(i);
543: pattern = Pattern.compile(citationRegex.getRegex());
544: matcher = pattern.matcher(citation);
545:
546: if (matcher.find()) {
547: regexName = citationRegex.getName();
548: break;
549: }
550: }
551:
552: if (regexName != null) {
553: // determine which fields are necessary
554: try {
555: hasVolume = recordHasPart(VolumePartStructure
556: .getInstance().getType()) == null ? false
557: : true;
558:
559: hasIssue = recordHasPart(IssuePartStructure
560: .getInstance().getType()) == null ? false
561: : true;
562:
563: hasDate = recordHasPart(DatePartStructure.getInstance()
564: .getType()) == null ? false : true;
565:
566: hasPages = recordHasPart(PagesPartStructure
567: .getInstance().getType()) == null ? false
568: : true;
569:
570: hasSourceTitle = recordHasPart(SourceTitlePartStructure
571: .getInstance().getType()) == null ? false
572: : true;
573:
574: // if all true, no need to go further
575: if (hasVolume && hasIssue && hasDate && hasPages
576: && hasSourceTitle) {
577: return;
578: }
579:
580: // check for matching regex
581: if (regexName.equals("zooRec")) {
582: // .+ \d+(\(\d+\))?, (.*)? \d{4}: \d+-\d+
583: if (!hasVolume) {
584: pattern = Pattern.compile("\\d+");
585: matcher = pattern.matcher(citation);
586: if (matcher.find()) {
587: record.createPart(VolumePartStructure
588: .getInstance().getId(), matcher
589: .group());
590: }
591: }
592:
593: if (!hasIssue) {
594: pattern = Pattern.compile("\\(\\d+\\)");
595: matcher = pattern.matcher(citation);
596: if (matcher.find()) {
597: record.createPart(IssuePartStructure
598: .getInstance().getId(), matcher
599: .group().replaceAll("\\D", ""));
600: }
601: }
602:
603: if (!hasDate) {
604: pattern = Pattern.compile(", (.*)? \\d{4}:");
605: matcher = pattern.matcher(citation);
606: if (matcher.find()) {
607: String date = matcher.group().substring(2,
608: matcher.group().length() - 1);
609: record.createPart(DatePartStructure
610: .getInstance().getId(), date);
611: }
612: }
613:
614: if (!hasPages) {
615: pattern = Pattern.compile("\\d+-\\d+");
616: matcher = pattern.matcher(citation);
617: if (matcher.find()) {
618: createPagesPart(matcher.group());
619: }
620: }
621:
622: if (!hasSourceTitle) {
623: pattern = Pattern.compile("\\D+\\d");
624: matcher = pattern.matcher(citation);
625: if (matcher.find()) {
626: String sourceTitle = matcher
627: .group()
628: .substring(
629: 0,
630: matcher.group().length() - 2);
631: record
632: .createPart(
633: SourceTitlePartStructure
634: .getInstance()
635: .getId(),
636: sourceTitle);
637: }
638: }
639: } else if (regexName.equals("animBehavAbs")) {
640: // .+ Vol\. \d+, no\. \d+, (\d+)? pp\.|p\. \d+(-\d+.)? (.*)? \d{4}\.$
641: if (!hasVolume) {
642: pattern = Pattern.compile("Vol\\. \\d+");
643: matcher = pattern.matcher(citation);
644: if (matcher.find()) {
645: record.createPart(VolumePartStructure
646: .getInstance().getId(), matcher
647: .group().replaceAll("\\D", ""));
648: }
649: }
650:
651: if (!hasIssue) {
652: pattern = Pattern.compile("no\\. \\d+");
653: matcher = pattern.matcher(citation);
654: if (matcher.find()) {
655: record.createPart(IssuePartStructure
656: .getInstance().getId(), matcher
657: .group().replaceAll("\\D", ""));
658: }
659: }
660:
661: if (!hasDate) {
662: pattern = Pattern
663: .compile("(pp\\.|p\\.) \\d+(-\\d+\\.)? (.*)? \\d{4}\\.$");
664: matcher = pattern.matcher(citation);
665: if (matcher.find()) {
666: String date = matcher
667: .group()
668: .substring(
669: matcher.group().indexOf(
670: " ", 4) + 1,
671: matcher.group().length() - 1);
672: record.createPart(DatePartStructure
673: .getInstance().getId(), date);
674: }
675: }
676:
677: if (!hasPages) {
678: pattern = Pattern
679: .compile("(pp\\.|p\\.) \\d+(-\\d+\\.)?");
680: matcher = pattern.matcher(citation);
681: if (matcher.find()) {
682: createPagesPart(matcher.group());
683: }
684: }
685:
686: if (!hasSourceTitle) {
687: pattern = Pattern.compile(".+ \\[");
688: matcher = pattern.matcher(citation);
689: if (matcher.find()) {
690: String sourceTitle = matcher
691: .group()
692: .substring(
693: 0,
694: matcher.group().length() - 2);
695: record
696: .createPart(
697: SourceTitlePartStructure
698: .getInstance()
699: .getId(),
700: sourceTitle);
701: }
702: }
703: } else if (regexName.equals("pubMed")) {
704: // .+ (Volume: \\d+, )?Issue: ((\\d+)|(\\w+)), Date: \\d{4} \\d+ \\d+,( Pages: \\d+-\\d+)?
705: if (!hasVolume) {
706: pattern = Pattern.compile("Volume: \\d+");
707: matcher = pattern.matcher(citation);
708: if (matcher.find()) {
709: record.createPart(VolumePartStructure
710: .getInstance().getId(), matcher
711: .group().replaceAll("\\D", ""));
712: }
713: }
714:
715: if (!hasIssue) {
716: pattern = Pattern
717: .compile("Issue: ((\\d+)|(\\w+))");
718: matcher = pattern.matcher(citation);
719: if (matcher.find()) {
720: String issue = matcher.group().substring(7,
721: matcher.group().length());
722: record.createPart(IssuePartStructure
723: .getInstance().getId(), issue);
724: }
725: }
726:
727: if (!hasDate) {
728: pattern = Pattern
729: .compile("Date: \\d{4} \\d+ \\d+");
730: matcher = pattern.matcher(citation);
731: if (matcher.find()) {
732: String date = matcher.group().substring(6,
733: matcher.group().length());
734: date = date.replaceAll("\\s", "-");
735: record.createPart(DatePartStructure
736: .getInstance().getId(), date);
737: }
738: }
739:
740: if (!hasPages) {
741: pattern = Pattern.compile("\\d+-\\d+");
742: matcher = pattern.matcher(citation);
743: if (matcher.find()) {
744: createPagesPart(matcher.group());
745: }
746: }
747:
748: if (!hasSourceTitle) {
749: pattern = Pattern.compile(".+\\. Vol");
750: matcher = pattern.matcher(citation);
751: if (matcher.find()) {
752: String sourceTitle = matcher
753: .group()
754: .substring(
755: 0,
756: matcher.group().length() - 5);
757: record
758: .createPart(
759: SourceTitlePartStructure
760: .getInstance()
761: .getId(),
762: sourceTitle);
763: }
764: }
765: } else if (regexName.equals("isiWos")) {
766: // ^\d+( \(\d+\))?: \w+-.+(.+)?( \w{3})?( \w{3}-\w{3})?( \d+)? \d{4}$
767: if (!hasVolume) {
768: pattern = Pattern.compile("^\\d+");
769: matcher = pattern.matcher(citation);
770: if (matcher.find()) {
771: record.createPart(VolumePartStructure
772: .getInstance().getId(), matcher
773: .group());
774: }
775: }
776:
777: if (!hasIssue) {
778: pattern = Pattern.compile("\\(\\d+\\)");
779: matcher = pattern.matcher(citation);
780: if (matcher.find()) {
781: record.createPart(IssuePartStructure
782: .getInstance().getId(), matcher
783: .group().replaceAll("\\D", ""));
784: }
785: }
786:
787: if (!hasDate) {
788: pattern = Pattern
789: .compile("( \\w{3})?( \\w{3}-\\w{3})?( \\d+)? \\d{4}$");
790: matcher = pattern.matcher(citation);
791: if (matcher.find()) {
792: record.createPart(DatePartStructure
793: .getInstance().getId(), matcher
794: .group().trim());
795: }
796: }
797:
798: if (!hasPages) {
799: pattern = Pattern.compile(" \\w+(-\\w+)?");
800: matcher = pattern.matcher(citation);
801: if (matcher.find()) {
802: createPagesPart(matcher.group().trim());
803: }
804: }
805: } else if (regexName.equals("jstor")) {
806: // .+, Vol\. \d+(, No\. \d+)?
807: if (!hasVolume) {
808: pattern = Pattern.compile("Vol\\. \\d+");
809: matcher = pattern.matcher(citation);
810: if (matcher.find()) {
811: record.createPart(VolumePartStructure
812: .getInstance().getId(), matcher
813: .group().replaceAll("\\D", ""));
814: }
815: }
816:
817: if (!hasIssue) {
818: pattern = Pattern.compile("No\\. \\d+");
819: matcher = pattern.matcher(citation);
820: if (matcher.find()) {
821: record.createPart(IssuePartStructure
822: .getInstance().getId(), matcher
823: .group().replaceAll("\\D", ""));
824: }
825: }
826:
827: if (!hasSourceTitle) {
828: pattern = Pattern.compile(".+, Vol");
829: matcher = pattern.matcher(citation);
830: if (matcher.find()) {
831: String sourceTitle = matcher
832: .group()
833: .substring(
834: 0,
835: matcher.group().length() - 5);
836: record
837: .createPart(
838: SourceTitlePartStructure
839: .getInstance()
840: .getId(),
841: sourceTitle);
842: }
843: }
844: } else if (regexName.equals("eric")) {
845: // ^v\d+ n|v\d+ p\d+-\d+( \w{3})?( \w{3}-\w{3})?( \d+)? \d{4}$
846: if (!hasVolume) {
847: pattern = Pattern.compile("^v\\d+");
848: matcher = pattern.matcher(citation);
849: if (matcher.find()) {
850: record.createPart(VolumePartStructure
851: .getInstance().getId(), matcher
852: .group().replaceAll("\\D", ""));
853: }
854: }
855:
856: if (!hasIssue) {
857: pattern = Pattern.compile(" (n|v)\\d+");
858: matcher = pattern.matcher(citation);
859: if (matcher.find()) {
860: record.createPart(IssuePartStructure
861: .getInstance().getId(), matcher
862: .group().trim().replaceAll("\\D",
863: ""));
864: }
865: }
866:
867: if (!hasDate) {
868: pattern = Pattern
869: .compile("( \\w{3})?( \\w{3}-\\w{3})?( \\d+)? \\d{4}$");
870: matcher = pattern.matcher(citation);
871: if (matcher.find()) {
872: record.createPart(DatePartStructure
873: .getInstance().getId(), matcher
874: .group().trim());
875: }
876: }
877:
878: if (!hasPages) {
879: pattern = Pattern.compile("\\d+-\\d+");
880: matcher = pattern.matcher(citation);
881: if (matcher.find()) {
882: createPagesPart(matcher.group());
883: }
884: }
885: } else if (regexName.equals("proquest")) {
886: // ^\d+; \d+(; .+)?
887: if (!hasVolume) {
888: pattern = Pattern.compile("^\\d+");
889: matcher = pattern.matcher(citation);
890: if (matcher.find()) {
891: record.createPart(VolumePartStructure
892: .getInstance().getId(), matcher
893: .group());
894: }
895: }
896:
897: if (!hasIssue) {
898: pattern = Pattern.compile("; \\d+");
899: matcher = pattern.matcher(citation);
900: if (matcher.find()) {
901: record.createPart(IssuePartStructure
902: .getInstance().getId(), matcher
903: .group().replaceAll("\\D", ""));
904: }
905: }
906:
907: if (!hasSourceTitle) {
908: pattern = Pattern.compile("; \\D+$");
909: matcher = pattern.matcher(citation);
910: if (matcher.find()) {
911: record.createPart(SourceTitlePartStructure
912: .getInstance().getId(), matcher
913: .group().substring(2,
914: matcher.group().length()));
915: }
916: }
917: } else if (regexName.equals("psycInfo")) {
918: // ^Vol \d+\([\w\p{Punct}]+\))
919: if (!hasVolume) {
920: pattern = Pattern.compile("^Vol \\d+");
921: matcher = pattern.matcher(citation);
922: if (matcher.find()) {
923: record.createPart(VolumePartStructure
924: .getInstance().getId(), matcher
925: .group().replaceAll("\\D", ""));
926: }
927: }
928:
929: if (!hasIssue) {
930: pattern = Pattern.compile("\\(.+\\)");
931: matcher = pattern.matcher(citation);
932: if (matcher.find()) {
933: record
934: .createPart(
935: IssuePartStructure
936: .getInstance()
937: .getId(),
938: matcher
939: .group()
940: .substring(
941: 1,
942: matcher
943: .group()
944: .length() - 1));
945: }
946: }
947: }
948: } catch (org.osid.repository.RepositoryException re) {
949: LOG.warn("doRegexParse() failed getting "
950: + "PartStructure Types.", re);
951: }
952: }
953: }
954:
955: private void createPagesPart(String text)
956: throws org.osid.repository.RepositoryException {
957: if (text.charAt(0) == ',') {
958: // getting a poorly formatted field
959: return;
960: }
961:
962: record.createPart(PagesPartStructure.getInstance().getId(),
963: text);
964:
965: // get start and end page if possible
966: String[] pages = text.split("-");
967:
968: if (pages.length == 0) {
969: // cannot create start/end page.
970: return;
971: }
972:
973: String spage = pages[0].trim();
974:
975: // delete all non-digit chars (ie: p., pp., etc.)
976: spage = spage.replaceAll("\\D", "");
977:
978: // create startPage part
979: record.createPart(StartPagePartStructure.getInstance().getId(),
980: spage);
981:
982: // end page
983: if (pages.length == 2) {
984: String epage = pages[1].trim();
985: epage = epage.replaceAll("\\D", "");
986: record.createPart(EndPagePartStructure.getInstance()
987: .getId(), epage);
988: }
989: }
990:
991: private String getId() {
992: return "asset" + Math.random() * 1000
993: + System.currentTimeMillis();
994: }
995: }
|