001: // Jericho HTML Parser - Java based library for analysing and manipulating HTML
002: // Version 2.5
003: // Copyright (C) 2007 Martin Jericho
004: // http://jerichohtml.sourceforge.net/
005: //
006: // This library is free software; you can redistribute it and/or
007: // modify it under the terms of either one of the following licences:
008: //
009: // 1. The Eclipse Public License (EPL) version 1.0,
010: // included in this distribution in the file licence-epl-1.0.html
011: // or available at http://www.eclipse.org/legal/epl-v10.html
012: //
013: // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
014: // included in this distribution in the file licence-lgpl-2.1.txt
015: // or available at http://www.gnu.org/licenses/lgpl.txt
016: //
017: // This library is distributed on an "AS IS" basis,
018: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
019: // See the individual licence texts for more details.
020:
021: package au.id.jericho.lib.html;
022:
023: import java.util.*;
024:
025: /**
026: * Represents a cached map of character positions to tags for a particular tag type,
027: * or for all tag types if the tagType field is null.
028: */
029: final class SubCache {
030: private final Cache cache;
031: public final TagType tagType; // does not support unregistered tag types at present
032: private final CacheEntry bof; // beginning of file marker
033: private final CacheEntry eof; // end of file marker
034: private CacheEntry[] array = new CacheEntry[INITIAL_CAPACITY];
035:
036: private static final int INITIAL_CAPACITY = 64;
037:
038: public SubCache(final Cache cache, final TagType tagType) {
039: this .cache = cache;
040: this .tagType = tagType;
041: array[0] = bof = new CacheEntry(0, -1, null, false, false);
042: array[1] = eof = new CacheEntry(1, cache.getSourceLength(),
043: null, false, false);
044: }
045:
046: public int size() {
047: return eof.index + 1;
048: }
049:
050: public void clear() {
051: bof.nextCached = false;
052: eof.index = 1;
053: eof.previousCached = false;
054: array[1] = eof;
055: }
056:
057: public void bulkLoad_Init(final int tagCount) {
058: array = new CacheEntry[tagCount + 2];
059: array[0] = bof;
060: bof.nextCached = true;
061: array[eof.index = tagCount + 1] = eof;
062: eof.previousCached = true;
063: }
064:
065: public void bulkLoad_Set(final int tagsIndex, final Tag tag) {
066: final int index = tagsIndex + 1;
067: array[index] = new CacheEntry(index, tag.begin, tag, true, true);
068: }
069:
070: public void bulkLoad_AddToTypeSpecificCache(final Tag tag) {
071: final int index = eof.index;
072: if (array.length == eof.index + 1)
073: doubleCapacity();
074: array[index] = new CacheEntry(index, tag.begin, tag, true, true);
075: eof.index++;
076: }
077:
078: public void bulkLoad_FinaliseTypeSpecificCache() {
079: bof.nextCached = true;
080: eof.previousCached = true;
081: array[eof.index] = eof;
082: }
083:
084: public Tag getTagAt(final int pos) {
085: // This must only be called on allTagTypesSubCache (ie tagType==null)
086: if (cache.getSourceLength() == 0)
087: return null;
088: if (pos < 0 || pos >= cache.getSourceLength())
089: return null;
090: final int index = getIndexOfPos(pos);
091: final CacheEntry cacheEntry = array[index];
092: if (cacheEntry.pos == pos)
093: return cacheEntry.tag;
094: if (cacheEntry.previousCached)
095: return null;
096: return cache.addTagAt(pos);
097: }
098:
099: public void addTagAt(final int pos, final Tag tag) {
100: final int index = getIndexOfPos(pos);
101: final CacheEntry nextCacheEntry = array[index];
102: final CacheEntry previousCacheEntry = getPrevious(nextCacheEntry);
103: add(previousCacheEntry, new CacheEntry(index, pos, tag,
104: pos == previousCacheEntry.pos + 1,
105: pos == nextCacheEntry.pos - 1), nextCacheEntry);
106: }
107:
108: public Tag findPreviousOrNextTag(final int pos,
109: final boolean previous) {
110: // Note that this method never returns tags for which tag.includInSearch() is false, so separate caching of unregistered tags won't work.
111: if (cache.getSourceLength() == 0)
112: return null;
113: if (pos < 0 || pos >= cache.getSourceLength())
114: return null;
115: int index = getIndexOfPos(pos);
116: final CacheEntry cacheEntry = array[index];
117: final Tag tag;
118: if (previous) {
119: if (cacheEntry.pos == pos && cacheEntry.tag != null
120: && cacheEntry.tag.includeInSearch())
121: return cacheEntry.tag;
122: tag = findPreviousTag(getPrevious(cacheEntry), pos,
123: cacheEntry);
124: addPreviousTag(pos, tag);
125: } else {
126: if (cacheEntry.pos == pos) {
127: if (cacheEntry.tag != null
128: && cacheEntry.tag.includeInSearch())
129: return cacheEntry.tag;
130: tag = findNextTag(cacheEntry, pos, getNext(cacheEntry));
131: } else {
132: tag = findNextTag(getPrevious(cacheEntry), pos,
133: cacheEntry);
134: }
135: addNextTag(pos, tag);
136: }
137: return tag;
138: }
139:
140: public Iterator getTagIterator() {
141: return new TagIterator();
142: }
143:
144: public String toString() {
145: return appendTo(new StringBuffer()).toString();
146: }
147:
148: protected StringBuffer appendTo(final StringBuffer sb) {
149: sb.append("Cache for TagType : ").append(tagType).append(
150: Config.NewLine);
151: for (int i = 0; i <= lastIndex(); i++)
152: sb.append(array[i]).append(Config.NewLine);
153: return sb;
154: }
155:
156: private Tag findPreviousTag(CacheEntry previousCacheEntry, int pos,
157: CacheEntry nextCacheEntry) {
158: // previousCacheEntry.pos < pos <= nextCacheEntry.pos
159: while (true) {
160: if (!nextCacheEntry.previousCached) {
161: final Tag tag = Tag.findPreviousOrNextTagUncached(
162: cache.source, pos, tagType, true,
163: previousCacheEntry.pos); // if useAllTypesCache is true, automatically adds tag to all caches if one is found, and maybe some unregistered tags along the way.
164: if (tag != null) {
165: if (!cache.source.useAllTypesCache)
166: addTagAt(tag.begin, tag); // have to add tag manually if useAllTypesCache is false
167: return tag;
168: }
169: }
170: if (previousCacheEntry == bof)
171: return null;
172: if (previousCacheEntry.tag != null
173: && previousCacheEntry.tag.includeInSearch())
174: return previousCacheEntry.tag;
175: pos = previousCacheEntry.pos - 1;
176: previousCacheEntry = getPrevious(nextCacheEntry = previousCacheEntry);
177: }
178: }
179:
180: private Tag findNextTag(CacheEntry previousCacheEntry, int pos,
181: CacheEntry nextCacheEntry) {
182: // previousCacheEntry.pos <= pos < nextCacheEntry.pos
183: while (true) {
184: if (!previousCacheEntry.nextCached) {
185: final Tag tag = Tag.findPreviousOrNextTagUncached(
186: cache.source, pos, tagType, false,
187: nextCacheEntry.pos); // if useAllTypesCache is true, automatically adds tag to caches if one is found, and maybe some unregistered tags along the way.
188: if (tag != null) {
189: if (!cache.source.useAllTypesCache)
190: addTagAt(tag.begin, tag); // have to add tag manually if useAllTypesCache is false
191: return tag;
192: }
193: }
194: if (nextCacheEntry == eof)
195: return null;
196: if (nextCacheEntry.tag != null
197: && nextCacheEntry.tag.includeInSearch())
198: return nextCacheEntry.tag;
199: pos = nextCacheEntry.pos + 1;
200: nextCacheEntry = getNext(previousCacheEntry = nextCacheEntry);
201: }
202: }
203:
204: private void addPreviousTag(final int pos, final Tag tag) {
205: final int tagPos = (tag == null) ? bof.pos : tag.begin;
206: if (tagPos == pos)
207: return; // the tag was found exactly on pos, so cache has already been fully updated
208: // tagPos < pos
209: int index = getIndexOfPos(pos);
210: CacheEntry stepCacheEntry = array[index];
211: // stepCacheEntry.pos is either == or > than tagPos.
212: // stepCacheEntry.pos is either == or > pos.
213: int compactStartIndex = Integer.MAX_VALUE;
214: if (stepCacheEntry.pos == pos) {
215: // a cache entry was aleady at pos (containing null or wrong tagType)
216: stepCacheEntry.previousCached = true;
217: if (stepCacheEntry.isRedundant()) {
218: stepCacheEntry.removed = true;
219: compactStartIndex = Math.min(compactStartIndex,
220: stepCacheEntry.index);
221: }
222: } else if (!stepCacheEntry.previousCached) {
223: // we have to add a new cacheEntry at pos:
224: if (tagType == null)
225: cache.addTagAt(pos); // this pos has never been checked before, so add it to all relevant SubCaches (a null or unregistered tag entry is always added to this SubCache)
226: else
227: addTagAt(pos, null); // all we know is that the pos doesn't contain a tag of this SubCache's type, so add a null entry to this SubCache only.
228: // now we have to reload the index and stepCacheEntry as they may have changed:
229: stepCacheEntry = array[index = getIndexOfPos(pos)];
230: // stepCacheEntry.pos is either == or > than tagPos.
231: // stepCacheEntry.pos is either == or > pos. (the latter if the added entry was redundant)
232: if (stepCacheEntry.pos == pos) {
233: // perform same steps as in the (stepCacheEntry.pos==pos) if condition above:
234: stepCacheEntry.previousCached = true;
235: if (stepCacheEntry.isRedundant()) {
236: stepCacheEntry.removed = true;
237: compactStartIndex = Math.min(compactStartIndex,
238: stepCacheEntry.index);
239: }
240: }
241: }
242: while (true) {
243: stepCacheEntry = array[--index];
244: if (stepCacheEntry.pos <= tagPos)
245: break;
246: if (stepCacheEntry.tag != null) {
247: if (stepCacheEntry.tag.includeInSearch())
248: throw new SourceCacheEntryMissingInternalError(
249: tagType, tag, this );
250: stepCacheEntry.previousCached = true;
251: stepCacheEntry.nextCached = true;
252: } else {
253: stepCacheEntry.removed = true;
254: compactStartIndex = Math.min(compactStartIndex,
255: stepCacheEntry.index);
256: }
257: }
258: if (stepCacheEntry.pos != tagPos)
259: throw new FoundCacheEntryMissingInternalError(tagType, tag,
260: this );
261: stepCacheEntry.nextCached = true;
262: compact(compactStartIndex);
263: }
264:
265: private void addNextTag(final int pos, final Tag tag) {
266: final int tagPos = (tag == null) ? eof.pos : tag.begin;
267: if (tagPos == pos)
268: return; // the tag was found exactly on pos, so cache has already been fully updated
269: // tagPos > pos
270: int index = getIndexOfPos(pos);
271: CacheEntry stepCacheEntry = array[index];
272: // stepCacheEntry.pos may be <, == or > than tagPos.
273: // stepCacheEntry.pos is either == or > pos.
274: int compactStartIndex = Integer.MAX_VALUE;
275: if (stepCacheEntry.pos == pos) {
276: // a cache entry was aleady at pos (containing null or wrong tagType)
277: stepCacheEntry.nextCached = true;
278: if (stepCacheEntry.isRedundant()) {
279: stepCacheEntry.removed = true;
280: compactStartIndex = Math.min(compactStartIndex,
281: stepCacheEntry.index);
282: }
283: } else if (!getPrevious(stepCacheEntry).nextCached) {
284: // we have to add a new cacheEntry at pos:
285: if (tagType == null)
286: cache.addTagAt(pos); // this pos has never been checked before, so add it to all relevant SubCaches (a null or unregistered tag entry is always added to this SubCache)
287: else
288: addTagAt(pos, null); // all we know is that the pos doesn't contain a tag of this SubCache's type, so add a null entry to this SubCache only.
289: // now we have to reload the index and stepCacheEntry as they may have changed:
290: stepCacheEntry = array[index = getIndexOfPos(pos)];
291: // stepCacheEntry.pos may be <, == or > than tagPos.
292: // stepCacheEntry.pos is either == or > pos. (the latter if the added entry was redundant)
293: if (stepCacheEntry.pos == pos) {
294: // perform same steps as in the (stepCacheEntry.pos==pos) if condition above:
295: stepCacheEntry.nextCached = true;
296: if (stepCacheEntry.isRedundant()) {
297: stepCacheEntry.removed = true;
298: compactStartIndex = Math.min(compactStartIndex,
299: stepCacheEntry.index);
300: }
301: }
302: }
303: if (stepCacheEntry.pos < tagPos) {
304: while (true) {
305: stepCacheEntry = array[++index];
306: if (stepCacheEntry.pos >= tagPos)
307: break;
308: if (stepCacheEntry.tag != null) {
309: if (stepCacheEntry.tag.includeInSearch())
310: throw new SourceCacheEntryMissingInternalError(
311: tagType, tag, this );
312: stepCacheEntry.previousCached = true;
313: stepCacheEntry.nextCached = true;
314: } else {
315: stepCacheEntry.removed = true;
316: compactStartIndex = Math.min(compactStartIndex,
317: stepCacheEntry.index);
318: }
319: }
320: if (stepCacheEntry.pos != tagPos)
321: throw new FoundCacheEntryMissingInternalError(tagType,
322: tag, this );
323: }
324: stepCacheEntry.previousCached = true;
325: compact(compactStartIndex);
326: }
327:
328: private void compact(int i) {
329: final int lastIndex = lastIndex();
330: int removedCount = 1;
331: while (i < lastIndex) {
332: final CacheEntry cacheEntry = array[++i];
333: if (cacheEntry.removed)
334: removedCount++;
335: else
336: array[cacheEntry.index = i - removedCount] = cacheEntry;
337: }
338: }
339:
340: private void add(final CacheEntry previousCacheEntry,
341: final CacheEntry newCacheEntry,
342: final CacheEntry nextCacheEntry) {
343: if (!newCacheEntry.isRedundant())
344: insert(newCacheEntry);
345: if (newCacheEntry.previousCached) {
346: previousCacheEntry.nextCached = true;
347: if (previousCacheEntry.isRedundant())
348: remove(previousCacheEntry);
349: }
350: if (newCacheEntry.nextCached) {
351: nextCacheEntry.previousCached = true;
352: if (nextCacheEntry.isRedundant())
353: remove(nextCacheEntry);
354: }
355: }
356:
357: private int getIndexOfPos(final int pos) {
358: // return the index of the cacheEntry at pos, or the index where it would be inserted if it does not exist.
359: int minIndex = 0;
360: int maxIndex = lastIndex();
361: // using the following complex calculation instead of a binary search is likely to result in less iterations but is slower overall:
362: // int index=(pos*maxIndex)/cache.getSourceLength(); // approximate first guess at index, assuming even distribution of cache entries
363: int index = maxIndex >> 1;
364: while (true) {
365: final CacheEntry cacheEntry = array[index];
366: if (pos > cacheEntry.pos) {
367: final CacheEntry nextCacheEntry = getNext(cacheEntry);
368: if (pos <= nextCacheEntry.pos)
369: return nextCacheEntry.index;
370: minIndex = nextCacheEntry.index;
371: } else if (pos < cacheEntry.pos) {
372: final CacheEntry previousCacheEntry = getPrevious(cacheEntry);
373: if (pos == previousCacheEntry.pos)
374: return previousCacheEntry.index;
375: if (pos > previousCacheEntry.pos)
376: return index;
377: maxIndex = previousCacheEntry.index;
378: } else {
379: return index;
380: }
381: index = (minIndex + maxIndex) >> 1;
382: // using the following complex calculation instead of a binary search is likely to result in less iterations but is slower overall:
383: // final int minIndexPos=array[minIndex].pos;
384: // index=((maxIndex-minIndex-1)*(pos-minIndexPos))/(array[maxIndex].pos-minIndexPos)+minIndex+1; // approximate next guess at index, assuming even distribution of cache entries between min and max entries
385: }
386: }
387:
388: private CacheEntry getNext(final CacheEntry cacheEntry) {
389: return array[cacheEntry.index + 1];
390: }
391:
392: private CacheEntry getPrevious(final CacheEntry cacheEntry) {
393: return array[cacheEntry.index - 1];
394: }
395:
396: private int lastIndex() {
397: return eof.index;
398: }
399:
400: private void insert(final CacheEntry cacheEntry) {
401: final int index = cacheEntry.index;
402: if (array.length == size())
403: doubleCapacity();
404: for (int i = lastIndex(); i >= index; i--) {
405: final CacheEntry movedCacheEntry = array[i];
406: array[movedCacheEntry.index = (i + 1)] = movedCacheEntry;
407: }
408: array[index] = cacheEntry;
409: }
410:
411: private void remove(final CacheEntry cacheEntry) {
412: final int lastIndex = lastIndex();
413: for (int i = cacheEntry.index; i < lastIndex; i++) {
414: final CacheEntry movedCacheEntry = array[i + 1];
415: array[movedCacheEntry.index = i] = movedCacheEntry;
416: }
417: }
418:
419: private void doubleCapacity() {
420: // assumes size==array.length
421: final CacheEntry[] newArray = new CacheEntry[array.length << 1];
422: for (int i = lastIndex(); i >= 0; i--)
423: newArray[i] = array[i];
424: array = newArray;
425: }
426:
427: private static class CacheEntryMissingInternalError extends
428: AssertionError {
429: public CacheEntryMissingInternalError(final TagType tagType,
430: final Tag tag, final SubCache subCache,
431: final String message) {
432: super (
433: "INTERNAL ERROR: Inconsistent Cache State for TagType \""
434: + tagType + "\" - " + message + ' '
435: + tag.getDebugInfo() + '\n' + subCache);
436: }
437: }
438:
439: private static class SourceCacheEntryMissingInternalError extends
440: CacheEntryMissingInternalError {
441: public SourceCacheEntryMissingInternalError(
442: final TagType tagType, final Tag tag,
443: final SubCache subCache) {
444: super (tagType, tag, subCache,
445: "cache entry no longer found in source:");
446: }
447: }
448:
449: private static class FoundCacheEntryMissingInternalError extends
450: CacheEntryMissingInternalError {
451: public FoundCacheEntryMissingInternalError(
452: final TagType tagType, final Tag tag,
453: final SubCache subCache) {
454: super (tagType, tag, subCache,
455: "missing cache entry for found tag");
456: }
457: }
458:
459: private final class TagIterator implements Iterator {
460: private int i = 0;
461: private Tag nextTag;
462:
463: public TagIterator() {
464: loadNextTag();
465: }
466:
467: public boolean hasNext() {
468: return nextTag != null;
469: }
470:
471: public Object next() {
472: final Tag result = nextTag;
473: loadNextTag();
474: return result;
475: }
476:
477: public void remove() {
478: throw new UnsupportedOperationException();
479: }
480:
481: private void loadNextTag() {
482: while (++i <= lastIndex()
483: && (nextTag = array[i].tag) == null) {
484: }
485: }
486: }
487:
488: private static final class CacheEntry {
489: public int index;
490: public final int pos;
491: public final Tag tag;
492: public boolean previousCached;
493: public boolean nextCached;
494: public boolean removed = false;
495:
496: public CacheEntry(final int index, final int pos,
497: final Tag tag, final boolean previousCached,
498: final boolean nextCached) {
499: this .index = index;
500: this .pos = pos;
501: this .tag = tag;
502: this .previousCached = previousCached;
503: this .nextCached = nextCached;
504: }
505:
506: public boolean isRedundant() {
507: return tag == null && previousCached && nextCached;
508: }
509:
510: public String toString() {
511: return pad(index, 4) + " " + pad(pos, 5) + " "
512: + (previousCached ? '|' : '-') + ' '
513: + (nextCached ? '|' : '-') + ' '
514: + (tag == null ? "null" : tag.getDebugInfo());
515: }
516:
517: private String pad(final int n, final int places) {
518: final String nstring = String.valueOf(n);
519: final StringBuffer sb = new StringBuffer(places);
520: for (int i = places - nstring.length(); i > 0; i--)
521: sb.append(' ');
522: sb.append(nstring);
523: return sb.toString();
524: }
525: }
526: }
|