001: /*
002: * Copyright 2005 Sun Microsystems, Inc.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: *
016: */
017: package com.sun.syndication.io.impl;
018:
019: import java.io.IOException;
020: import java.io.Reader;
021: import java.io.InputStreamReader;
022: import java.io.BufferedReader;
023: import java.util.HashMap;
024: import java.util.Map;
025: import java.util.regex.Pattern;
026: import java.util.regex.Matcher;
027: import java.net.URL;
028:
/**
 * Character-stream filter that cleans up two common defects of XML feeds
 * before they reach an XML parser:
 * <ul>
 * <li>whitespace and XML comments that appear before the first real content
 * (typically before the XML prolog) are dropped;</li>
 * <li>HTML named character entities (for example <code>&amp;nbsp;</code>),
 * which are not defined in plain XML, are replaced with the equivalent
 * numeric character reference (<code>&amp;#160;</code>).</li>
 * </ul>
 * Entities that are not in the HTML 4 entity tables are passed through
 * unchanged. Instances are not synchronized.
 *
 * @author Alejandro Abdelnur
 */
public class XmlFixerReader extends Reader {

    /**
     * Command-line helper: fetches the URL given as first argument, runs it
     * through an <code>XmlFixerReader</code> and prints the result to
     * standard output line by line.
     *
     * @param args <code>args[0]</code> is the URL to read.
     * @throws Exception if the URL is malformed or reading fails.
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: XmlFixerReader <url>");
            return;
        }
        BufferedReader br = new BufferedReader(new XmlFixerReader(
                new InputStreamReader(new URL(args[0]).openStream())));
        try {
            String l = br.readLine();
            while (l != null) {
                System.out.println(l);
                l = br.readLine();
            }
        } finally {
            // closes the whole chain, including the URL stream
            br.close();
        }
    }

    // States of the entity-fixing state machine driven by read().
    /** Passing characters straight through from the underlying reader. */
    private static final int STATE_STREAM = 0;
    /** An '&amp;' was seen; collecting a candidate entity into the buffer. */
    private static final int STATE_ENTITY = 1;
    /** A complete "&amp;...;" token is buffered and must be looked up. */
    private static final int STATE_REPLACE = 2;
    /** Replaying the buffer contents to the caller. */
    private static final int STATE_BUFFER = 3;

    /** The wrapped reader all characters are pulled from. */
    protected Reader in;

    /** Whether the leading whitespace/comments have already been skipped. */
    private boolean trimmed;
    /** Holds look-ahead characters that still have to be returned. */
    private StringBuffer _buffer;
    /** Index of the next character of <code>_buffer</code> to return. */
    private int _bufferPos;
    /** Current state of the entity-fixing state machine. */
    private int _state = STATE_STREAM;
    /**
     * Set when an '&amp;' interrupted an entity being collected: once the
     * buffer has been replayed, a new entity starts at that '&amp;'.
     */
    private boolean _entityPending;

    /**
     * Creates a fixer on top of the given reader.
     *
     * @param in the reader supplying the raw XML characters.
     */
    public XmlFixerReader(Reader in) {
        super(in);
        this.in = in;
        _buffer = new StringBuffer();
        _state = STATE_STREAM;
    }

    /**
     * Skips whitespace and complete XML comments at the start of the stream.
     * <p>
     * On return, any characters that were read ahead but belong to the
     * content (for example the "&lt;?" of the prolog, or an unterminated
     * comment) are left in the buffer and the reader state is set so that
     * {@link #read()} replays them first.
     *
     * @return <code>false</code> if the stream ended before any content was
     *         found, <code>true</code> otherwise.
     * @throws IOException if the underlying reader fails.
     */
    private boolean trimStream() throws IOException {
        // Local states: 0 = skipping whitespace, 1 = seen "<", 2 = seen "<!",
        // 3 = seen "<!-", 4 = inside a comment, 5 = seen "-" in a comment,
        // 6 = seen "--" in a comment.
        int state = 0;
        while (true) {
            int c = in.read();
            if (c == -1) {
                if (state == 0) {
                    return false; // only whitespace/comments in the stream
                }
                // flush whatever partial token was collected
                _state = STATE_BUFFER;
                return true;
            }
            switch (state) {
            case 0:
                if (c == ' ' || c == '\n' || c == '\r' || c == '\t') {
                    break; // leading whitespace is dropped
                }
                _buffer.setLength(0);
                _bufferPos = 0;
                _buffer.append((char) c);
                if (c != '<') {
                    _state = STATE_BUFFER;
                    return true;
                }
                state = 1;
                break;
            case 1:
                _buffer.append((char) c);
                if (c != '!') {
                    // a tag or processing instruction: real content
                    _state = STATE_BUFFER;
                    return true;
                }
                state = 2;
                break;
            case 2:
            case 3:
                _buffer.append((char) c);
                if (c != '-') {
                    // "<!" not followed by "--" (e.g. a DOCTYPE): content
                    _state = STATE_BUFFER;
                    return true;
                }
                state++;
                break;
            case 4:
                _buffer.append((char) c);
                if (c == '-') {
                    state = 5;
                }
                break;
            case 5:
                _buffer.append((char) c);
                state = (c == '-') ? 6 : 4;
                break;
            case 6:
                if (c == '>') {
                    // comment complete: discard it and keep trimming
                    _buffer.setLength(0);
                    state = 0;
                } else {
                    _buffer.append((char) c);
                    if (c != '-') {
                        state = 4;
                    }
                    // on '-' the last two characters are still "--", so a
                    // comment closed with "--->" is recognised as well
                }
                break;
            default:
                throw new IOException("It shouldn't happen");
            }
        }
    }

    /**
     * Resets the buffer so that it contains a single '&amp;' and switches to
     * entity-collecting mode.
     */
    private void startEntity() {
        _buffer.setLength(0);
        _bufferPos = 0;
        _buffer.append('&');
        _state = STATE_ENTITY;
    }

    /**
     * Tells whether the character may appear inside an entity reference
     * (between the '&amp;' and the ';').
     */
    private static boolean isEntityChar(int c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
                || (c >= '0' && c <= '9') || c == '#';
    }

    /**
     * Reads the next character of the fixed stream.
     * <p>
     * The first call skips leading whitespace and comments. Afterwards every
     * "&amp;name;" token found in the stream is looked up in the entity table
     * and, if known, replaced by its numeric character reference.
     *
     * @return the character read, or -1 at the end of the stream.
     * @throws IOException if the underlying reader fails.
     */
    public int read() throws IOException {
        if (!trimmed) { // trims XML stream
            trimmed = true;
            if (!trimStream()) {
                return -1;
            }
        }
        while (true) { // converts literal entities to coded entities
            switch (_state) {
            case STATE_STREAM: {
                int c = in.read();
                if (c != '&') {
                    return c; // ordinary character or end of stream
                }
                startEntity();
                break;
            }
            case STATE_ENTITY: {
                int c = in.read();
                if (c == ';') {
                    _buffer.append((char) c);
                    _state = STATE_REPLACE;
                } else if (isEntityChar(c)) {
                    _buffer.append((char) c);
                } else if (c == '&') {
                    // not an entity after all; replay what was collected
                    // and start over at this '&'
                    _entityPending = true;
                    _state = STATE_BUFFER;
                } else {
                    // not an entity after all; replay as is
                    if (c != -1) {
                        _buffer.append((char) c);
                    }
                    _state = STATE_BUFFER;
                }
                break;
            }
            case STATE_REPLACE: {
                String codedEntity = (String) CODED_ENTITIES.get(_buffer
                        .toString());
                if (codedEntity != null) {
                    _buffer.setLength(0);
                    _buffer.append(codedEntity);
                } // else we leave what was in the stream
                _state = STATE_BUFFER;
                break;
            }
            case STATE_BUFFER:
                if (_bufferPos < _buffer.length()) {
                    return _buffer.charAt(_bufferPos++);
                }
                if (_entityPending) {
                    _entityPending = false;
                    startEntity();
                } else {
                    _state = STATE_STREAM;
                }
                break;
            default:
                throw new IOException("It shouldn't happen");
            }
        }
    }

    /**
     * Reads up to <code>len</code> fixed characters into an array.
     *
     * @param buffer destination array.
     * @param offset index at which to start storing characters.
     * @param len maximum number of characters to read.
     * @return the number of characters stored, 0 if <code>len</code> is 0,
     *         or -1 if the end of the stream was already reached.
     * @throws IndexOutOfBoundsException if <code>offset</code> and
     *         <code>len</code> do not describe a range inside the array.
     * @throws IOException if the underlying reader fails.
     */
    public int read(char[] buffer, int offset, int len)
            throws IOException {
        if (offset < 0 || len < 0 || len > buffer.length - offset) {
            throw new IndexOutOfBoundsException();
        }
        if (len == 0) {
            return 0; // must not consume a character nor touch the array
        }
        int c = read();
        if (c == -1) {
            return -1;
        }
        int charsRead = 0;
        buffer[offset + (charsRead++)] = (char) c;
        while (charsRead < len && (c = read()) > -1) {
            buffer[offset + (charsRead++)] = (char) c;
        }
        return charsRead;
    }

    /**
     * Skips characters of the fixed stream.
     *
     * @param n number of characters to skip.
     * @return the number of characters actually skipped, which is smaller
     *         than <code>n</code> if the end of the stream was reached.
     * @throws IllegalArgumentException if <code>n</code> is negative.
     * @throws IOException if the underlying reader fails.
     */
    public long skip(long n) throws IOException {
        if (n < 0) {
            throw new IllegalArgumentException("'n' cannot be negative");
        }
        long skipped = 0;
        // only successful reads count; the read that hits EOF does not
        while (skipped < n && read() > -1) {
            skipped++;
        }
        return skipped;
    }

    /**
     * Tells whether a character can be read without blocking: either there
     * are buffered characters left to replay or the underlying reader
     * reports it is ready.
     *
     * @throws IOException if the underlying reader fails.
     */
    public boolean ready() throws IOException {
        return (_state == STATE_BUFFER && _bufferPos < _buffer.length())
                || in.ready();
    }

    /** Marking is not supported; always returns <code>false</code>. */
    public boolean markSupported() {
        return false;
    }

    /**
     * Not supported.
     *
     * @throws IOException always.
     */
    public void mark(int readAheadLimit) throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /**
     * Not supported.
     *
     * @throws IOException always.
     */
    public void reset() throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /**
     * Closes the underlying reader.
     *
     * @throws IOException if the underlying reader fails to close.
     */
    public void close() throws IOException {
        in.close();
    }

    /**
     * Maps each HTML 4 named entity, written as it appears in a document
     * ("&amp;name;"), to its numeric character reference ("&amp;#NNN;").
     * Populated once in the static initializer and only read afterwards.
     */
    private static final Map CODED_ENTITIES = new HashMap();

    static {
        // note: refer to Character entity references in HTML 4
        // at http://www.w3.org/TR/REC-html40/sgml/entities.html

        // Character entity set.
        // HTMLlat1 "-//W3C//ENTITIES Latin 1//EN//HTML"

        CODED_ENTITIES.put("&nbsp;", "&#160;");
        CODED_ENTITIES.put("&iexcl;", "&#161;");
        CODED_ENTITIES.put("&cent;", "&#162;");
        CODED_ENTITIES.put("&pound;", "&#163;");
        CODED_ENTITIES.put("&curren;", "&#164;");
        CODED_ENTITIES.put("&yen;", "&#165;");
        CODED_ENTITIES.put("&brvbar;", "&#166;");
        CODED_ENTITIES.put("&sect;", "&#167;");
        CODED_ENTITIES.put("&uml;", "&#168;");
        CODED_ENTITIES.put("&copy;", "&#169;");
        CODED_ENTITIES.put("&ordf;", "&#170;");
        CODED_ENTITIES.put("&laquo;", "&#171;");
        CODED_ENTITIES.put("&not;", "&#172;");
        CODED_ENTITIES.put("&shy;", "&#173;");
        CODED_ENTITIES.put("&reg;", "&#174;");
        CODED_ENTITIES.put("&macr;", "&#175;");
        CODED_ENTITIES.put("&deg;", "&#176;");
        CODED_ENTITIES.put("&plusmn;", "&#177;");
        CODED_ENTITIES.put("&sup2;", "&#178;");
        CODED_ENTITIES.put("&sup3;", "&#179;");
        CODED_ENTITIES.put("&acute;", "&#180;");
        CODED_ENTITIES.put("&micro;", "&#181;");
        CODED_ENTITIES.put("&para;", "&#182;");
        CODED_ENTITIES.put("&middot;", "&#183;");
        CODED_ENTITIES.put("&cedil;", "&#184;");
        CODED_ENTITIES.put("&sup1;", "&#185;");
        CODED_ENTITIES.put("&ordm;", "&#186;");
        CODED_ENTITIES.put("&raquo;", "&#187;");
        CODED_ENTITIES.put("&frac14;", "&#188;");
        CODED_ENTITIES.put("&frac12;", "&#189;");
        CODED_ENTITIES.put("&frac34;", "&#190;");
        CODED_ENTITIES.put("&iquest;", "&#191;");
        CODED_ENTITIES.put("&Agrave;", "&#192;");
        CODED_ENTITIES.put("&Aacute;", "&#193;");
        CODED_ENTITIES.put("&Acirc;", "&#194;");
        CODED_ENTITIES.put("&Atilde;", "&#195;");
        CODED_ENTITIES.put("&Auml;", "&#196;");
        CODED_ENTITIES.put("&Aring;", "&#197;");
        CODED_ENTITIES.put("&AElig;", "&#198;");
        CODED_ENTITIES.put("&Ccedil;", "&#199;");
        CODED_ENTITIES.put("&Egrave;", "&#200;");
        CODED_ENTITIES.put("&Eacute;", "&#201;");
        CODED_ENTITIES.put("&Ecirc;", "&#202;");
        CODED_ENTITIES.put("&Euml;", "&#203;");
        CODED_ENTITIES.put("&Igrave;", "&#204;");
        CODED_ENTITIES.put("&Iacute;", "&#205;");
        CODED_ENTITIES.put("&Icirc;", "&#206;");
        CODED_ENTITIES.put("&Iuml;", "&#207;");
        CODED_ENTITIES.put("&ETH;", "&#208;");
        CODED_ENTITIES.put("&Ntilde;", "&#209;");
        CODED_ENTITIES.put("&Ograve;", "&#210;");
        CODED_ENTITIES.put("&Oacute;", "&#211;");
        CODED_ENTITIES.put("&Ocirc;", "&#212;");
        CODED_ENTITIES.put("&Otilde;", "&#213;");
        CODED_ENTITIES.put("&Ouml;", "&#214;");
        CODED_ENTITIES.put("&times;", "&#215;");
        CODED_ENTITIES.put("&Oslash;", "&#216;");
        CODED_ENTITIES.put("&Ugrave;", "&#217;");
        CODED_ENTITIES.put("&Uacute;", "&#218;");
        CODED_ENTITIES.put("&Ucirc;", "&#219;");
        CODED_ENTITIES.put("&Uuml;", "&#220;");
        CODED_ENTITIES.put("&Yacute;", "&#221;");
        CODED_ENTITIES.put("&THORN;", "&#222;");
        CODED_ENTITIES.put("&szlig;", "&#223;");
        CODED_ENTITIES.put("&agrave;", "&#224;");
        CODED_ENTITIES.put("&aacute;", "&#225;");
        CODED_ENTITIES.put("&acirc;", "&#226;");
        CODED_ENTITIES.put("&atilde;", "&#227;");
        CODED_ENTITIES.put("&auml;", "&#228;");
        CODED_ENTITIES.put("&aring;", "&#229;");
        CODED_ENTITIES.put("&aelig;", "&#230;");
        CODED_ENTITIES.put("&ccedil;", "&#231;");
        CODED_ENTITIES.put("&egrave;", "&#232;");
        CODED_ENTITIES.put("&eacute;", "&#233;");
        CODED_ENTITIES.put("&ecirc;", "&#234;");
        CODED_ENTITIES.put("&euml;", "&#235;");
        CODED_ENTITIES.put("&igrave;", "&#236;");
        CODED_ENTITIES.put("&iacute;", "&#237;");
        CODED_ENTITIES.put("&icirc;", "&#238;");
        CODED_ENTITIES.put("&iuml;", "&#239;");
        CODED_ENTITIES.put("&eth;", "&#240;");
        CODED_ENTITIES.put("&ntilde;", "&#241;");
        CODED_ENTITIES.put("&ograve;", "&#242;");
        CODED_ENTITIES.put("&oacute;", "&#243;");
        CODED_ENTITIES.put("&ocirc;", "&#244;");
        CODED_ENTITIES.put("&otilde;", "&#245;");
        CODED_ENTITIES.put("&ouml;", "&#246;");
        CODED_ENTITIES.put("&divide;", "&#247;");
        CODED_ENTITIES.put("&oslash;", "&#248;");
        CODED_ENTITIES.put("&ugrave;", "&#249;");
        CODED_ENTITIES.put("&uacute;", "&#250;");
        CODED_ENTITIES.put("&ucirc;", "&#251;");
        CODED_ENTITIES.put("&uuml;", "&#252;");
        CODED_ENTITIES.put("&yacute;", "&#253;");
        CODED_ENTITIES.put("&thorn;", "&#254;");
        CODED_ENTITIES.put("&yuml;", "&#255;");

        // Mathematical, Greek and Symbolic characters for HTML.
        // HTMLsymbol "-//W3C//ENTITIES Symbols//EN//HTML"

        CODED_ENTITIES.put("&fnof;", "&#402;");
        CODED_ENTITIES.put("&Alpha;", "&#913;");
        CODED_ENTITIES.put("&Beta;", "&#914;");
        CODED_ENTITIES.put("&Gamma;", "&#915;");
        CODED_ENTITIES.put("&Delta;", "&#916;");
        CODED_ENTITIES.put("&Epsilon;", "&#917;");
        CODED_ENTITIES.put("&Zeta;", "&#918;");
        CODED_ENTITIES.put("&Eta;", "&#919;");
        CODED_ENTITIES.put("&Theta;", "&#920;");
        CODED_ENTITIES.put("&Iota;", "&#921;");
        CODED_ENTITIES.put("&Kappa;", "&#922;");
        CODED_ENTITIES.put("&Lambda;", "&#923;");
        CODED_ENTITIES.put("&Mu;", "&#924;");
        CODED_ENTITIES.put("&Nu;", "&#925;");
        CODED_ENTITIES.put("&Xi;", "&#926;");
        CODED_ENTITIES.put("&Omicron;", "&#927;");
        CODED_ENTITIES.put("&Pi;", "&#928;");
        CODED_ENTITIES.put("&Rho;", "&#929;");
        CODED_ENTITIES.put("&Sigma;", "&#931;");
        CODED_ENTITIES.put("&Tau;", "&#932;");
        CODED_ENTITIES.put("&Upsilon;", "&#933;");
        CODED_ENTITIES.put("&Phi;", "&#934;");
        CODED_ENTITIES.put("&Chi;", "&#935;");
        CODED_ENTITIES.put("&Psi;", "&#936;");
        CODED_ENTITIES.put("&Omega;", "&#937;");
        CODED_ENTITIES.put("&alpha;", "&#945;");
        CODED_ENTITIES.put("&beta;", "&#946;");
        CODED_ENTITIES.put("&gamma;", "&#947;");
        CODED_ENTITIES.put("&delta;", "&#948;");
        CODED_ENTITIES.put("&epsilon;", "&#949;");
        CODED_ENTITIES.put("&zeta;", "&#950;");
        CODED_ENTITIES.put("&eta;", "&#951;");
        CODED_ENTITIES.put("&theta;", "&#952;");
        CODED_ENTITIES.put("&iota;", "&#953;");
        CODED_ENTITIES.put("&kappa;", "&#954;");
        CODED_ENTITIES.put("&lambda;", "&#955;");
        CODED_ENTITIES.put("&mu;", "&#956;");
        CODED_ENTITIES.put("&nu;", "&#957;");
        CODED_ENTITIES.put("&xi;", "&#958;");
        CODED_ENTITIES.put("&omicron;", "&#959;");
        CODED_ENTITIES.put("&pi;", "&#960;");
        CODED_ENTITIES.put("&rho;", "&#961;");
        CODED_ENTITIES.put("&sigmaf;", "&#962;");
        CODED_ENTITIES.put("&sigma;", "&#963;");
        CODED_ENTITIES.put("&tau;", "&#964;");
        CODED_ENTITIES.put("&upsilon;", "&#965;");
        CODED_ENTITIES.put("&phi;", "&#966;");
        CODED_ENTITIES.put("&chi;", "&#967;");
        CODED_ENTITIES.put("&psi;", "&#968;");
        CODED_ENTITIES.put("&omega;", "&#969;");
        CODED_ENTITIES.put("&thetasym;", "&#977;");
        CODED_ENTITIES.put("&upsih;", "&#978;");
        CODED_ENTITIES.put("&piv;", "&#982;");
        CODED_ENTITIES.put("&bull;", "&#8226;");
        CODED_ENTITIES.put("&hellip;", "&#8230;");
        CODED_ENTITIES.put("&prime;", "&#8242;");
        CODED_ENTITIES.put("&Prime;", "&#8243;");
        CODED_ENTITIES.put("&oline;", "&#8254;");
        CODED_ENTITIES.put("&frasl;", "&#8260;");
        CODED_ENTITIES.put("&weierp;", "&#8472;");
        CODED_ENTITIES.put("&image;", "&#8465;");
        CODED_ENTITIES.put("&real;", "&#8476;");
        CODED_ENTITIES.put("&trade;", "&#8482;");
        CODED_ENTITIES.put("&alefsym;", "&#8501;");
        CODED_ENTITIES.put("&larr;", "&#8592;");
        CODED_ENTITIES.put("&uarr;", "&#8593;");
        CODED_ENTITIES.put("&rarr;", "&#8594;");
        CODED_ENTITIES.put("&darr;", "&#8595;");
        CODED_ENTITIES.put("&harr;", "&#8596;");
        CODED_ENTITIES.put("&crarr;", "&#8629;");
        CODED_ENTITIES.put("&lArr;", "&#8656;");
        CODED_ENTITIES.put("&uArr;", "&#8657;");
        CODED_ENTITIES.put("&rArr;", "&#8658;");
        CODED_ENTITIES.put("&dArr;", "&#8659;");
        CODED_ENTITIES.put("&hArr;", "&#8660;");
        CODED_ENTITIES.put("&forall;", "&#8704;");
        CODED_ENTITIES.put("&part;", "&#8706;");
        CODED_ENTITIES.put("&exist;", "&#8707;");
        CODED_ENTITIES.put("&empty;", "&#8709;");
        CODED_ENTITIES.put("&nabla;", "&#8711;");
        CODED_ENTITIES.put("&isin;", "&#8712;");
        CODED_ENTITIES.put("&notin;", "&#8713;");
        CODED_ENTITIES.put("&ni;", "&#8715;");
        CODED_ENTITIES.put("&prod;", "&#8719;");
        CODED_ENTITIES.put("&sum;", "&#8721;");
        CODED_ENTITIES.put("&minus;", "&#8722;");
        CODED_ENTITIES.put("&lowast;", "&#8727;");
        CODED_ENTITIES.put("&radic;", "&#8730;");
        CODED_ENTITIES.put("&prop;", "&#8733;");
        CODED_ENTITIES.put("&infin;", "&#8734;");
        CODED_ENTITIES.put("&ang;", "&#8736;");
        CODED_ENTITIES.put("&and;", "&#8743;");
        CODED_ENTITIES.put("&or;", "&#8744;");
        CODED_ENTITIES.put("&cap;", "&#8745;");
        CODED_ENTITIES.put("&cup;", "&#8746;");
        CODED_ENTITIES.put("&int;", "&#8747;");
        CODED_ENTITIES.put("&there4;", "&#8756;");
        CODED_ENTITIES.put("&sim;", "&#8764;");
        CODED_ENTITIES.put("&cong;", "&#8773;");
        CODED_ENTITIES.put("&asymp;", "&#8776;");
        CODED_ENTITIES.put("&ne;", "&#8800;");
        CODED_ENTITIES.put("&equiv;", "&#8801;");
        CODED_ENTITIES.put("&le;", "&#8804;");
        CODED_ENTITIES.put("&ge;", "&#8805;");
        CODED_ENTITIES.put("&sub;", "&#8834;");
        CODED_ENTITIES.put("&sup;", "&#8835;");
        CODED_ENTITIES.put("&nsub;", "&#8836;");
        CODED_ENTITIES.put("&sube;", "&#8838;");
        CODED_ENTITIES.put("&supe;", "&#8839;");
        CODED_ENTITIES.put("&oplus;", "&#8853;");
        CODED_ENTITIES.put("&otimes;", "&#8855;");
        CODED_ENTITIES.put("&perp;", "&#8869;");
        CODED_ENTITIES.put("&sdot;", "&#8901;");
        CODED_ENTITIES.put("&lceil;", "&#8968;");
        CODED_ENTITIES.put("&rceil;", "&#8969;");
        CODED_ENTITIES.put("&lfloor;", "&#8970;");
        CODED_ENTITIES.put("&rfloor;", "&#8971;");
        CODED_ENTITIES.put("&lang;", "&#9001;");
        CODED_ENTITIES.put("&rang;", "&#9002;");
        CODED_ENTITIES.put("&loz;", "&#9674;");
        CODED_ENTITIES.put("&spades;", "&#9824;");
        CODED_ENTITIES.put("&clubs;", "&#9827;");
        CODED_ENTITIES.put("&hearts;", "&#9829;");
        CODED_ENTITIES.put("&diams;", "&#9830;");

        // Special characters for HTML.
        // HTMLspecial "-//W3C//ENTITIES Special//EN//HTML"

        CODED_ENTITIES.put("&quot;", "&#34;");
        CODED_ENTITIES.put("&amp;", "&#38;");
        CODED_ENTITIES.put("&lt;", "&#60;");
        CODED_ENTITIES.put("&gt;", "&#62;");
        CODED_ENTITIES.put("&OElig;", "&#338;");
        CODED_ENTITIES.put("&oelig;", "&#339;");
        CODED_ENTITIES.put("&Scaron;", "&#352;");
        CODED_ENTITIES.put("&scaron;", "&#353;");
        CODED_ENTITIES.put("&Yuml;", "&#376;");
        CODED_ENTITIES.put("&circ;", "&#710;");
        CODED_ENTITIES.put("&tilde;", "&#732;");
        CODED_ENTITIES.put("&ensp;", "&#8194;");
        CODED_ENTITIES.put("&emsp;", "&#8195;");
        CODED_ENTITIES.put("&thinsp;", "&#8201;");
        CODED_ENTITIES.put("&zwnj;", "&#8204;");
        CODED_ENTITIES.put("&zwj;", "&#8205;");
        CODED_ENTITIES.put("&lrm;", "&#8206;");
        CODED_ENTITIES.put("&rlm;", "&#8207;");
        CODED_ENTITIES.put("&ndash;", "&#8211;");
        CODED_ENTITIES.put("&mdash;", "&#8212;");
        CODED_ENTITIES.put("&lsquo;", "&#8216;");
        CODED_ENTITIES.put("&rsquo;", "&#8217;");
        CODED_ENTITIES.put("&sbquo;", "&#8218;");
        CODED_ENTITIES.put("&ldquo;", "&#8220;");
        CODED_ENTITIES.put("&rdquo;", "&#8221;");
        CODED_ENTITIES.put("&bdquo;", "&#8222;");
        CODED_ENTITIES.put("&dagger;", "&#8224;");
        CODED_ENTITIES.put("&Dagger;", "&#8225;");
        CODED_ENTITIES.put("&permil;", "&#8240;");
        CODED_ENTITIES.put("&lsaquo;", "&#8249;");
        CODED_ENTITIES.put("&rsaquo;", "&#8250;");
        CODED_ENTITIES.put("&euro;", "&#8364;");
    }

    //
    // It shouldn't be here but well, just reusing the CODED_ENTITIES Map :)
    //

    /**
     * Matches an "&amp;...;" token made of the same characters that
     * {@link #read()} accepts inside an entity (letters, digits and '#'),
     * so that names containing digits such as "&amp;frac12;" are found.
     */
    private static final Pattern ENTITIES_PATTERN = Pattern
            .compile("&[A-Za-z0-9#]+;");

    /**
     * Replaces every HTML named entity found in a string by its numeric
     * character reference, using the same table as the reader itself.
     * Tokens that are not known entities are left untouched.
     *
     * @param s the text to process; may be <code>null</code>.
     * @return the processed text, or <code>s</code> itself when it is
     *         <code>null</code> or contains no '&amp;'.
     */
    public String processHtmlEntities(String s) {
        if (s == null || s.indexOf('&') == -1) {
            return s;
        }
        StringBuffer sb = new StringBuffer(s.length());
        Matcher m = ENTITIES_PATTERN.matcher(s);
        int pos = 0;
        // single left-to-right pass: copy the text between matches verbatim
        while (m.find()) {
            sb.append(s.substring(pos, m.start()));
            String literalEntity = m.group();
            String codedEntity = (String) CODED_ENTITIES.get(literalEntity);
            sb.append(codedEntity != null ? codedEntity : literalEntity);
            pos = m.end();
        }
        sb.append(s.substring(pos));
        return sb.toString();
    }

}
|