0001: // plasmaCondenser.java
0002: // -----------------------
0003: // part of YaCy
0004: // (C) by Michael Peter Christen; mc@anomic.de
0005: // first published on http://www.anomic.de
0006: // Frankfurt, Germany, 2004
0007: // last change: 09.01.2004
0008: //
0009: // This program is free software; you can redistribute it and/or modify
0010: // it under the terms of the GNU General Public License as published by
0011: // the Free Software Foundation; either version 2 of the License, or
0012: // (at your option) any later version.
0013: //
0014: // This program is distributed in the hope that it will be useful,
0015: // but WITHOUT ANY WARRANTY; without even the implied warranty of
0016: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0017: // GNU General Public License for more details.
0018: //
0019: // You should have received a copy of the GNU General Public License
0020: // along with this program; if not, write to the Free Software
0021: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0022: //
0023: // Using this software in any meaning (reading, learning, copying, compiling,
0024: // running) means that you agree that the Author(s) is (are) not responsible
0025: // for cost, loss of data or any harm that may be caused directly or indirectly
0026: // by usage of this softare or this documentation. The usage of this software
0027: // is on your own risk. The installation and usage (starting/running) of this
0028: // software may allow other people or application to access your computer and
0029: // any attached devices and is highly dependent on the configuration of the
0030: // software which must be done by the user of the software; the author(s) is
0031: // (are) also not responsible for proper configuration and usage of the
0032: // software, even if provoked by documentation provided together with
0033: // the software.
0034: //
0035: // Any changes to this file according to the GPL as documented in the file
0036: // gpl.txt aside this file in the shipment you received can be done to the
0037: // lines that follows this copyright notice here, but changes must not be
0038: // done inside the copyright notive above. A re-distribution must contain
0039: // the intact and unchanged copyright notice.
0040: // Contributions and changes to the program code must be marked as such.
0041:
0042: // compile with javac -sourcepath source source/de/anomic/plasma/plasmaCondenser.java
0043: // execute with java -cp source de.anomic.plasma.plasmaCondenser
0044:
0045: package de.anomic.plasma;
0046:
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.io.Reader;
import java.io.UnsupportedEncodingException;

import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.index.indexRWIEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.server.serverCodings;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;
0077:
0078: public final class plasmaCondenser {
0079:
0080: // this is the page analysis class
0081:
0082: // category flags that show how the page can be distinguished in different interest groups
0083: public static final int flag_cat_indexof = 0; // a directory listing page (i.e. containing 'index of')
0084: public static final int flag_cat_opencontent = 1; // open source, any free stuff
0085: public static final int flag_cat_business = 2; // web shops, marketing, trade
0086: public static final int flag_cat_stockfinance = 3; // stock exchange (quotes), finance, economy
0087: public static final int flag_cat_health = 4; // health
0088: public static final int flag_cat_sport = 5; // any sport, cars etc.
0089: public static final int flag_cat_lifestyle = 6; // travel, lifestyle
0090: public static final int flag_cat_politics = 7; // politics
0091: public static final int flag_cat_news = 8; // blogs, news pages
0092: public static final int flag_cat_children = 9; // toys, childrens education, help for parents
0093: public static final int flag_cat_entertainment = 10; // boulevard, entertainment, cultural content
0094: public static final int flag_cat_knowledge = 11; // science, school stuff, help for homework
0095: public static final int flag_cat_computer = 12; // any computer related stuff, networks, operation systems
0096: public static final int flag_cat_p2p = 13; // p2p support, file-sharing archives etc.
0097: public static final int flag_cat_sex = 14; // sexual content
0098: public static final int flag_cat_spam = 15; // pages that anybody would consider as not interesting
0099: public static final int flag_cat_linux = 16; // pages about linux software
0100: public static final int flag_cat_macos = 17; // pages about macintosh, apple computers and the mac os
0101: public static final int flag_cat_windows = 18; // pages about windows os and software
0102: public static final int flag_cat_osreserve = 19; // reserve
0103: public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images
0104: public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file
0105: public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos
0106: public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file
0107:
0108: private final static int numlength = 5;
0109:
0110: //private Properties analysis;
0111: private TreeMap<String, wordStatProp> words; // a string (the words) to (wordStatProp) - relation
0112: private HashMap<StringBuffer, phraseStatProp> sentences;
0113: private int wordminsize;
0114: private int wordcut;
0115:
0116: //public int RESULT_NUMB_TEXT_BYTES = -1;
0117: public int RESULT_NUMB_WORDS = -1;
0118: public int RESULT_DIFF_WORDS = -1;
0119: public int RESULT_NUMB_SENTENCES = -1;
0120: public int RESULT_DIFF_SENTENCES = -1;
0121: public kelondroBitfield RESULT_FLAGS = new kelondroBitfield(4);
0122:
0123: public plasmaCondenser(plasmaParserDocument document,
0124: boolean indexText, boolean indexMedia)
0125: throws UnsupportedEncodingException {
0126: // if addMedia == true, then all the media links are also parsed and added to the words
0127: // added media words are flagged with the appropriate media flag
0128: this .wordminsize = 3;
0129: this .wordcut = 2;
0130: this .words = new TreeMap<String, wordStatProp>();
0131: this .sentences = new HashMap<StringBuffer, phraseStatProp>();
0132: this .RESULT_FLAGS = new kelondroBitfield(4);
0133:
0134: //System.out.println("DEBUG: condensing " + document.getMainLongTitle() + ", indexText=" + Boolean.toString(indexText) + ", indexMedia=" + Boolean.toString(indexMedia));
0135:
0136: insertTextToWords(document.dc_source()
0137: .toNormalform(false, true), 0,
0138: indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS);
0139:
0140: Map.Entry<yacyURL, String> entry;
0141: if (indexText) {
0142: createCondensement(document.getText(), document
0143: .getCharset());
0144: // the phrase counter:
0145: // phrase 0 are words taken from the URL
0146: // phrase 1 is the MainTitle
0147: // phrase 2 is <not used>
0148: // phrase 3 is the Document Abstract
0149: // phrase 4 is the Document Author
0150: // phrase 5 are the tags specified in document
0151: // phrase 10 and above are the section headlines/titles (88 possible)
0152: // phrase 98 is taken from the embedded anchor/hyperlinks description
0153: // phrase 99 is taken from the media Link url and anchor description
0154: // phrase 100 and above are lines from the text
0155:
0156: insertTextToWords(document.dc_title(), 1,
0157: indexRWIEntry.flag_app_dc_title, RESULT_FLAGS);
0158: insertTextToWords(document.dc_description(), 3,
0159: indexRWIEntry.flag_app_dc_description, RESULT_FLAGS);
0160: insertTextToWords(document.dc_creator(), 4,
0161: indexRWIEntry.flag_app_dc_creator, RESULT_FLAGS);
0162: // missing: tags!
0163: String[] titles = document.getSectionTitles();
0164: for (int i = 0; i < titles.length; i++) {
0165: insertTextToWords(titles[i], i + 10,
0166: indexRWIEntry.flag_app_emphasized, RESULT_FLAGS);
0167: }
0168:
0169: // anchors
0170: Iterator<Map.Entry<yacyURL, String>> i = document
0171: .getAnchors().entrySet().iterator();
0172: while (i.hasNext()) {
0173: entry = i.next();
0174: if ((entry == null) || (entry.getKey() == null))
0175: continue;
0176: insertTextToWords(entry.getKey().toNormalform(false,
0177: false), 98,
0178: indexRWIEntry.flag_app_dc_identifier,
0179: RESULT_FLAGS);
0180: insertTextToWords((String) entry.getValue(), 98,
0181: indexRWIEntry.flag_app_dc_description,
0182: RESULT_FLAGS);
0183: }
0184: } else {
0185: this .RESULT_NUMB_WORDS = 0;
0186: this .RESULT_DIFF_WORDS = 0;
0187: this .RESULT_NUMB_SENTENCES = 0;
0188: this .RESULT_DIFF_SENTENCES = 0;
0189: }
0190:
0191: if (indexMedia) {
0192: // audio
0193: Iterator<Map.Entry<yacyURL, String>> i = document
0194: .getAudiolinks().entrySet().iterator();
0195: while (i.hasNext()) {
0196: entry = i.next();
0197: insertTextToWords(entry.getKey().toNormalform(false,
0198: false), 99, flag_cat_hasaudio, RESULT_FLAGS);
0199: insertTextToWords((String) entry.getValue(), 99,
0200: flag_cat_hasaudio, RESULT_FLAGS);
0201: }
0202:
0203: // video
0204: i = document.getVideolinks().entrySet().iterator();
0205: while (i.hasNext()) {
0206: entry = i.next();
0207: insertTextToWords(entry.getKey().toNormalform(false,
0208: false), 99, flag_cat_hasvideo, RESULT_FLAGS);
0209: insertTextToWords((String) entry.getValue(), 99,
0210: flag_cat_hasvideo, RESULT_FLAGS);
0211: }
0212:
0213: // applications
0214: i = document.getApplinks().entrySet().iterator();
0215: while (i.hasNext()) {
0216: entry = i.next();
0217: insertTextToWords(entry.getKey().toNormalform(false,
0218: false), 99, flag_cat_hasapp, RESULT_FLAGS);
0219: insertTextToWords((String) entry.getValue(), 99,
0220: flag_cat_hasapp, RESULT_FLAGS);
0221: }
0222:
0223: // images
0224: Iterator<htmlFilterImageEntry> j = document.getImages()
0225: .iterator();
0226: htmlFilterImageEntry ientry;
0227: while (j.hasNext()) {
0228: ientry = j.next();
0229: insertTextToWords(ientry.url().toNormalform(false,
0230: false), 99, flag_cat_hasimage, RESULT_FLAGS);
0231: insertTextToWords(ientry.alt(), 99, flag_cat_hasimage,
0232: RESULT_FLAGS);
0233: }
0234:
0235: // finally check all words for missing flag entry
0236: Iterator<Map.Entry<String, wordStatProp>> k = words
0237: .entrySet().iterator();
0238: wordStatProp wprop;
0239: Map.Entry<String, wordStatProp> we;
0240: while (k.hasNext()) {
0241: we = k.next();
0242: wprop = we.getValue();
0243: if (wprop.flags == null) {
0244: wprop.flags = (kelondroBitfield) RESULT_FLAGS
0245: .clone();
0246: words.put(we.getKey(), wprop);
0247: }
0248: }
0249: }
0250:
0251: // construct flag set for document
0252: if (document.getImages().size() > 0)
0253: RESULT_FLAGS.set(flag_cat_hasimage, true);
0254: if (document.getAudiolinks().size() > 0)
0255: RESULT_FLAGS.set(flag_cat_hasaudio, true);
0256: if (document.getVideolinks().size() > 0)
0257: RESULT_FLAGS.set(flag_cat_hasvideo, true);
0258: if (document.getApplinks().size() > 0)
0259: RESULT_FLAGS.set(flag_cat_hasapp, true);
0260: }
0261:
0262: private void insertTextToWords(String text, int phrase,
0263: int flagpos, kelondroBitfield flagstemplate) {
0264: String word;
0265: wordStatProp wprop;
0266: sievedWordsEnum wordenum;
0267: try {
0268: wordenum = new sievedWordsEnum(new ByteArrayInputStream(
0269: text.getBytes()), "UTF-8", 3);
0270: } catch (UnsupportedEncodingException e) {
0271: return;
0272: }
0273: int pip = 0;
0274: while (wordenum.hasMoreElements()) {
0275: word = (new String((StringBuffer) wordenum.nextElement()))
0276: .toLowerCase();
0277: wprop = (wordStatProp) words.get(word);
0278: if (wprop == null)
0279: wprop = new wordStatProp(0, pip, phrase);
0280: if (wprop.flags == null)
0281: wprop.flags = (kelondroBitfield) flagstemplate.clone();
0282: wprop.flags.set(flagpos, true);
0283: words.put(word, wprop);
0284: pip++;
0285: this .RESULT_NUMB_WORDS++;
0286: this .RESULT_DIFF_WORDS++;
0287: }
0288: }
0289:
0290: public plasmaCondenser(InputStream text, String charset)
0291: throws UnsupportedEncodingException {
0292: this (text, charset, 3, 2);
0293: }
0294:
0295: public plasmaCondenser(InputStream text, String charset,
0296: int wordminsize, int wordcut)
0297: throws UnsupportedEncodingException {
0298: this .wordminsize = wordminsize;
0299: this .wordcut = wordcut;
0300: // analysis = new Properties();
0301: words = new TreeMap<String, wordStatProp>();
0302: sentences = new HashMap<StringBuffer, phraseStatProp>();
0303: createCondensement(text, charset);
0304: }
0305:
0306: // create a word hash
0307: public static final String word2hash(String word) {
0308: return kelondroBase64Order.enhancedCoder.encode(
0309: serverCodings.encodeMD5Raw(word.toLowerCase()))
0310: .substring(0, yacySeedDB.commonHashLength);
0311: }
0312:
0313: public static final Set<String> words2hashSet(String[] words) {
0314: TreeSet<String> hashes = new TreeSet<String>(
0315: kelondroBase64Order.enhancedComparator);
0316: for (int i = 0; i < words.length; i++)
0317: hashes.add(word2hash(words[i]));
0318: return hashes;
0319: }
0320:
0321: public static final String words2hashString(String[] words) {
0322: StringBuffer sb = new StringBuffer();
0323: for (int i = 0; i < words.length; i++)
0324: sb.append(word2hash(words[i]));
0325: return new String(sb);
0326: }
0327:
0328: public static final TreeSet<String> words2hashes(Set<String> words) {
0329: Iterator<String> i = words.iterator();
0330: TreeSet<String> hashes = new TreeSet<String>(
0331: kelondroBase64Order.enhancedComparator);
0332: while (i.hasNext())
0333: hashes.add(word2hash(i.next()));
0334: return hashes;
0335: }
0336:
0337: public int excludeWords(TreeSet<String> stopwords) {
0338: // subtracts the given stopwords from the word list
0339: // the word list shrinkes. This returns the number of shrinked words
0340: int oldsize = words.size();
0341: words = kelondroMSetTools.excludeConstructive(words, stopwords);
0342: return oldsize - words.size();
0343: }
0344:
0345: public Map<String, wordStatProp> words() {
0346: // returns the words as word/wordStatProp relation map
0347: return words;
0348: }
0349:
0350: public Map<StringBuffer, phraseStatProp> sentences() {
0351: return sentences;
0352: }
0353:
0354: public static class wordStatProp {
0355: // object carries statistics for words and sentences
0356:
0357: public int count; // number of occurrences
0358: public int posInText; // unique handle, is initialized with word position (excluding double occurring words)
0359: public int posInPhrase; // position of word in phrase
0360: public int numOfPhrase; // number of phrase. 'normal' phrases begin with number 100
0361: private HashSet<Integer> hash; // a set of handles to all sentences where this word appears
0362: public kelondroBitfield flags; // the flag bits for each word
0363:
0364: public wordStatProp(int handle, int pip, int nop) {
0365: this .count = 1;
0366: this .posInText = handle;
0367: this .posInPhrase = pip;
0368: this .numOfPhrase = nop;
0369: this .hash = new HashSet<Integer>();
0370: this .flags = null;
0371: }
0372:
0373: public void inc() {
0374: count++;
0375: }
0376:
0377: public void check(int i) {
0378: hash.add(new Integer(i));
0379: }
0380:
0381: }
0382:
0383: public static class phraseStatProp {
0384: // object carries statistics for words and sentences
0385:
0386: public int count; // number of occurrences
0387: public int handle; // unique handle, is initialized with sentence counter
0388: private HashSet<Integer> hash; //
0389:
0390: public phraseStatProp(int handle) {
0391: this .count = 1;
0392: this .handle = handle;
0393: this .hash = new HashSet<Integer>();
0394: }
0395:
0396: public void inc() {
0397: count++;
0398: }
0399:
0400: public void check(int i) {
0401: hash.add(new Integer(i));
0402: }
0403:
0404: }
0405:
0406: public String intString(int number, int length) {
0407: String s = Integer.toString(number);
0408: while (s.length() < length)
0409: s = "0" + s;
0410: return s;
0411: }
0412:
0413: private void createCondensement(InputStream is, String charset)
0414: throws UnsupportedEncodingException {
0415: HashSet<String> currsentwords = new HashSet<String>();
0416: StringBuffer sentence = new StringBuffer(100);
0417: String word = "";
0418: String k;
0419: int wordlen;
0420: wordStatProp wsp, wsp1;
0421: phraseStatProp psp;
0422: int wordHandle;
0423: int wordHandleCount = 0;
0424: int sentenceHandleCount = 0;
0425: int allwordcounter = 0;
0426: int allsentencecounter = 0;
0427: int idx;
0428: int wordInSentenceCounter = 1;
0429: boolean comb_indexof = false, last_last = false, last_index = false;
0430: RandomAccessFile fa;
0431: final boolean dumpWords = false;
0432:
0433: if (dumpWords)
0434: try {
0435: fa = new RandomAccessFile(new File("dump.txt"), "rw");
0436: fa.seek(fa.length());
0437: } catch (IOException e) {
0438: e.printStackTrace();
0439: fa = null;
0440: }
0441:
0442: // read source
0443: sievedWordsEnum wordenum = new sievedWordsEnum(is, charset,
0444: wordminsize);
0445: while (wordenum.hasMoreElements()) {
0446: word = (new String((StringBuffer) wordenum.nextElement()))
0447: .toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars?
0448: //System.out.println("PARSED-WORD " + word);
0449:
0450: //This is useful for testing what YaCy "sees" of a website.
0451: if (dumpWords && fa != null)
0452: try {
0453: fa.writeBytes(word);
0454: fa.write(160);
0455: } catch (IOException e) {
0456: e.printStackTrace();
0457: }
0458:
0459: // distinguish punctuation and words
0460: wordlen = word.length();
0461: Iterator<String> it;
0462: if ((wordlen == 1)
0463: && (htmlFilterContentScraper.punctuation(word
0464: .charAt(0)))) {
0465: // store sentence
0466: if (sentence.length() > 0) {
0467: // we store the punctuation symbol as first element of the sentence vector
0468: allsentencecounter++;
0469: sentence.insert(0, word); // append at beginning
0470: if (sentences.containsKey(sentence)) {
0471: // sentence already exists
0472: psp = (phraseStatProp) sentences.get(sentence);
0473: psp.inc();
0474: idx = psp.handle;
0475: sentences.put(sentence, psp);
0476: } else {
0477: // create new sentence
0478: idx = sentenceHandleCount++;
0479: sentences
0480: .put(sentence, new phraseStatProp(idx));
0481: }
0482: // store to the words a link to this sentence
0483: it = currsentwords.iterator();
0484: while (it.hasNext()) {
0485: k = (String) it.next();
0486: wsp = (wordStatProp) words.get(k);
0487: wsp.check(idx);
0488: words.put(k, wsp);
0489: }
0490: }
0491: sentence = new StringBuffer(100);
0492: currsentwords.clear();
0493: wordInSentenceCounter = 1;
0494: } else {
0495: // check index.of detection
0496: if ((last_last) && (comb_indexof)
0497: && (word.equals("modified"))) {
0498: this .RESULT_FLAGS.set(flag_cat_indexof, true);
0499: wordenum.pre(true); // parse lines as they come with CRLF
0500: }
0501: if ((last_index) && (word.equals("of")))
0502: comb_indexof = true;
0503: last_last = word.equals("last");
0504: last_index = word.equals("index");
0505:
0506: // store word
0507: allwordcounter++;
0508: currsentwords.add(word);
0509: if (words.containsKey(word)) {
0510: // word already exists
0511: wsp = (wordStatProp) words.get(word);
0512: wordHandle = wsp.posInText;
0513: wsp.inc();
0514: } else {
0515: // word does not yet exist, create new word entry
0516: wordHandle = wordHandleCount++;
0517: wsp = new wordStatProp(wordHandle,
0518: wordInSentenceCounter,
0519: sentences.size() + 100);
0520: wsp.flags = (kelondroBitfield) RESULT_FLAGS.clone();
0521: }
0522: words.put(word, wsp);
0523: // we now have the unique handle of the word, put it into the sentence:
0524: sentence.append(intString(wordHandle, numlength));
0525: wordInSentenceCounter++;
0526: }
0527: }
0528: // finish last sentence
0529: if (sentence.length() > 0) {
0530: allsentencecounter++;
0531: sentence.insert(0, "."); // append at beginning
0532: if (sentences.containsKey(sentence)) {
0533: psp = (phraseStatProp) sentences.get(sentence);
0534: psp.inc();
0535: sentences.put(sentence, psp);
0536: } else {
0537: sentences.put(sentence, new phraseStatProp(
0538: sentenceHandleCount++));
0539: }
0540: }
0541:
0542: if (dumpWords && fa != null)
0543: try {
0544: fa.write('\n');
0545: fa.close();
0546: } catch (IOException e) {
0547: e.printStackTrace();
0548: }
0549:
0550: // -------------------
0551:
0552: // we reconstruct the sentence hashtable
0553: // and order the entries by the number of the sentence
0554: // this structure is needed to replace double occurring words in sentences
0555: Object[] orderedSentences = new Object[sentenceHandleCount];
0556: String[] s;
0557: int wc;
0558: Object o;
0559: Iterator<StringBuffer> sit = sentences.keySet().iterator();
0560: while (sit.hasNext()) {
0561: o = sit.next();
0562: if (o != null) {
0563: sentence = (StringBuffer) o;
0564: wc = (sentence.length() - 1) / numlength;
0565: s = new String[wc + 2];
0566: psp = (phraseStatProp) sentences.get(sentence);
0567: s[0] = intString(psp.count, numlength); // number of occurrences of this sentence
0568: s[1] = sentence.substring(0, 1); // the termination symbol of this sentence
0569: for (int i = 0; i < wc; i++) {
0570: k = sentence.substring(i * numlength + 1, (i + 1)
0571: * numlength + 1);
0572: s[i + 2] = k;
0573: }
0574: orderedSentences[psp.handle] = s;
0575: }
0576: }
0577:
0578: Map.Entry<String, wordStatProp> entry;
0579: // we search for similar words and reorganize the corresponding sentences
0580: // a word is similar, if a shortened version is equal
0581: Iterator<Map.Entry<String, wordStatProp>> wi = words.entrySet()
0582: .iterator(); // enumerates the keys in descending order
0583: wordsearch: while (wi.hasNext()) {
0584: entry = wi.next();
0585: word = entry.getKey();
0586: wordlen = word.length();
0587: wsp = entry.getValue();
0588: for (int i = wordcut; i > 0; i--) {
0589: if (wordlen > i) {
0590: k = word.substring(0, wordlen - i);
0591: if (words.containsKey(k)) {
0592: // we will delete the word 'word' and repoint the
0593: // corresponding links
0594: // in sentences that use this word
0595: wsp1 = (wordStatProp) words.get(k);
0596: Iterator<Integer> it1 = wsp.hash.iterator(); // we iterate over all sentences that refer to this word
0597: while (it1.hasNext()) {
0598: idx = it1.next().intValue(); // number of a sentence
0599: s = (String[]) orderedSentences[idx];
0600: for (int j = 2; j < s.length; j++) {
0601: if (s[j].equals(intString(
0602: wsp.posInText, numlength)))
0603: s[j] = intString(wsp1.posInText,
0604: numlength);
0605: }
0606: orderedSentences[idx] = s;
0607: }
0608: // update word counter
0609: wsp1.count = wsp1.count + wsp.count;
0610: words.put(k, wsp1);
0611: // remove current word
0612: wi.remove();
0613: continue wordsearch;
0614: }
0615: }
0616: }
0617: }
0618:
0619: // depending on the orderedSentences structure, we rebuild the sentence
0620: // HashMap to eliminate double occurring sentences
0621: sentences = new HashMap<StringBuffer, phraseStatProp>();
0622: int le;
0623: for (int i = 0; i < orderedSentences.length; i++) {
0624: le = ((String[]) orderedSentences[i]).length;
0625: sentence = new StringBuffer(le * 10);
0626: for (int j = 1; j < le; j++)
0627: sentence.append(((String[]) orderedSentences[i])[j]);
0628: if (sentences.containsKey(sentence)) {
0629: // add sentence counter to counter of found sentence
0630: psp = sentences.get(sentence);
0631: psp.count = psp.count
0632: + Integer
0633: .parseInt(((String[]) orderedSentences[i])[0]);
0634: sentences.put(sentence, psp);
0635: // System.out.println("Found double occurring sentence " + i + "
0636: // = " + sp.handle);
0637: } else {
0638: // create new sentence entry
0639: psp = new phraseStatProp(i);
0640: psp.count = Integer
0641: .parseInt(((String[]) orderedSentences[i])[0]);
0642: sentences.put(sentence, psp);
0643: }
0644: }
0645:
0646: // store result
0647: //this.RESULT_NUMB_TEXT_BYTES = wordenum.count();
0648: this .RESULT_NUMB_WORDS = allwordcounter;
0649: this .RESULT_DIFF_WORDS = wordHandleCount;
0650: this .RESULT_NUMB_SENTENCES = allsentencecounter;
0651: this .RESULT_DIFF_SENTENCES = sentenceHandleCount;
0652: }
0653:
0654: public void print() {
0655: String[] s = sentenceReconstruction();
0656:
0657: // printout a reconstruction of the text
0658: for (int i = 0; i < s.length; i++) {
0659: if (s[i] != null)
0660: System.out.print("#T " + intString(i, numlength) + " "
0661: + s[i]);
0662: }
0663: }
0664:
0665: private String[] sentenceReconstruction() {
0666: // we reconstruct the word hashtable
0667: // and order the entries by the number of the sentence
0668: // this structure is only needed to reconstruct the text
0669: String word;
0670: wordStatProp wsp;
0671: Map.Entry<String, wordStatProp> entry;
0672: Iterator<Map.Entry<String, wordStatProp>> it;
0673: String[] orderedWords = new String[words.size() + 99]; // uuiiii, the '99' is only a quick hack...
0674: it = words.entrySet().iterator(); // enumerates the keys in ascending order
0675: while (it.hasNext()) {
0676: entry = it.next();
0677: word = entry.getKey();
0678: wsp = entry.getValue();
0679: orderedWords[wsp.posInText] = word;
0680: }
0681:
0682: Object[] orderedSentences = makeOrderedSentences();
0683:
0684: // create a reconstruction of the text
0685: String[] result = new String[orderedSentences.length];
0686: String s;
0687: for (int i = 0; i < orderedSentences.length; i++) {
0688: if (orderedSentences[i] != null) {
0689: // TODO: bugfix for UTF-8: avoid this form of string concatenation
0690: s = "";
0691: for (int j = 2; j < ((String[]) orderedSentences[i]).length; j++) {
0692: s += " "
0693: + orderedWords[Integer
0694: .parseInt(((String[]) orderedSentences[i])[j])];
0695: }
0696: s += ((String[]) orderedSentences[i])[1];
0697: result[i] = (s.length() > 1) ? s.substring(1) : s;
0698: } else {
0699: result[i] = "";
0700: }
0701: }
0702: return result;
0703: }
0704:
0705: private Object[] makeOrderedSentences() {
0706: // we reconstruct the sentence hashtable again and create by-handle ordered entries
0707: // this structure is needed to present the strings in the right order in a printout
0708: int wc;
0709: phraseStatProp psp;
0710: String[] s;
0711: StringBuffer sentence;
0712: Object[] orderedSentences = new Object[sentences.size()];
0713: for (int i = 0; i < sentences.size(); i++) {
0714: orderedSentences[i] = null; // this array must be initialized
0715: }
0716: Iterator<StringBuffer> it = sentences.keySet().iterator();
0717: while (it.hasNext()) {
0718: sentence = (StringBuffer) it.next();
0719: wc = (sentence.length() - 1) / numlength;
0720: s = new String[wc + 2];
0721: psp = (phraseStatProp) sentences.get(sentence);
0722: s[0] = intString(psp.count, numlength); // number of occurrences of this sentence
0723: s[1] = sentence.substring(0, 1); // the termination symbol of this sentence
0724: for (int i = 0; i < wc; i++)
0725: s[i + 2] = sentence.substring(i * numlength + 1,
0726: (i + 1) * numlength + 1);
0727: orderedSentences[psp.handle] = s;
0728: }
0729: return orderedSentences;
0730: }
0731:
0732: public final static boolean invisible(char c) {
0733: // TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
0734: if ((c < ' ') || (c > 'z'))
0735: return true;
0736: return ("$%&/()=\"$%&/()=`^+*~#'-_:;,|<>[]\\".indexOf(c) >= 0);
0737: }
0738:
0739: public static Enumeration<StringBuffer> wordTokenizer(String s,
0740: String charset, int minLength) {
0741: try {
0742: return new sievedWordsEnum(new ByteArrayInputStream(s
0743: .getBytes()), charset, minLength);
0744: } catch (Exception e) {
0745: return null;
0746: }
0747: }
0748:
0749: public static class sievedWordsEnum implements
0750: Enumeration<StringBuffer> {
0751: // this enumeration removes all words that contain either wrong characters or are too short
0752:
0753: StringBuffer buffer = null;
0754: unsievedWordsEnum e;
0755: int ml;
0756:
0757: public sievedWordsEnum(InputStream is, String charset,
0758: int minLength) throws UnsupportedEncodingException {
0759: e = new unsievedWordsEnum(is, charset);
0760: buffer = nextElement0();
0761: ml = minLength;
0762: }
0763:
0764: public void pre(boolean x) {
0765: e.pre(x);
0766: }
0767:
0768: private StringBuffer nextElement0() {
0769: StringBuffer s;
0770: char c;
0771: loop: while (e.hasMoreElements()) {
0772: s = (StringBuffer) e.nextElement();
0773: if ((s.length() == 1)
0774: && (htmlFilterContentScraper.punctuation(s
0775: .charAt(0))))
0776: return s;
0777: if ((s.length() < ml) && (!(s.equals("of"))))
0778: continue loop;
0779: for (int i = 0; i < s.length(); i++) {
0780: c = s.charAt(i);
0781: // TODO: Bugfix needed for UTF-8
0782: if (((c < 'a') || (c > 'z'))
0783: && ((c < 'A') || (c > 'Z'))
0784: && ((c < '0') || (c > '9')))
0785: continue loop; // go to next while loop
0786: }
0787: return s;
0788: }
0789: return null;
0790: }
0791:
0792: public boolean hasMoreElements() {
0793: return buffer != null;
0794: }
0795:
0796: public StringBuffer nextElement() {
0797: StringBuffer r = buffer;
0798: buffer = nextElement0();
0799: return r;
0800: }
0801:
0802: }
0803:
0804: private static class unsievedWordsEnum implements
0805: Enumeration<StringBuffer> {
0806: // returns an enumeration of StringBuffer Objects
0807: StringBuffer buffer = null;
0808: sentencesFromInputStreamEnum e;
0809: StringBuffer s;
0810:
0811: public unsievedWordsEnum(InputStream is, String charset)
0812: throws UnsupportedEncodingException {
0813: e = new sentencesFromInputStreamEnum(is, charset);
0814: s = new StringBuffer();
0815: buffer = nextElement0();
0816: }
0817:
0818: public void pre(boolean x) {
0819: e.pre(x);
0820: }
0821:
0822: private StringBuffer nextElement0() {
0823: StringBuffer r;
0824: StringBuffer sb;
0825: char c;
0826: while (s.length() == 0) {
0827: if (e.hasNext()) {
0828: r = (StringBuffer) e.next();
0829: if (r == null)
0830: return null;
0831: r = trim(r);
0832: sb = new StringBuffer(r.length() * 2);
0833: for (int i = 0; i < r.length(); i++) {
0834: c = r.charAt(i);
0835: if (invisible(c))
0836: sb = sb.append(' '); // TODO: Bugfix needed for UTF-8
0837: else if (htmlFilterContentScraper
0838: .punctuation(c))
0839: sb = sb.append(' ').append(c).append(' ');
0840: else
0841: sb = sb.append(c);
0842: }
0843: s = trim(sb);
0844: //System.out.println("PARSING-LINE '" + r + "'->'" + s + "'");
0845: } else {
0846: return null;
0847: }
0848: }
0849: int p = s.indexOf(" ");
0850: if (p < 0) {
0851: r = s;
0852: s = new StringBuffer();
0853: return r;
0854: }
0855: r = trim(new StringBuffer(s.substring(0, p)));
0856: s = trim(s.delete(0, p + 1));
0857: return r;
0858: }
0859:
0860: public boolean hasMoreElements() {
0861: return buffer != null;
0862: }
0863:
0864: public StringBuffer nextElement() {
0865: StringBuffer r = buffer;
0866: buffer = nextElement0();
0867: return r;
0868: }
0869:
0870: }
0871:
0872: public static StringBuffer trim(StringBuffer sb) {
0873: synchronized (sb) {
0874: while ((sb.length() > 0) && (sb.charAt(0) <= ' '))
0875: sb = sb.deleteCharAt(0);
0876: while ((sb.length() > 0)
0877: && (sb.charAt(sb.length() - 1) <= ' '))
0878: sb = sb.deleteCharAt(sb.length() - 1);
0879: }
0880: return sb;
0881: }
0882:
0883: public static sentencesFromInputStreamEnum sentencesFromInputStream(
0884: InputStream is, String charset) {
0885: try {
0886: return new sentencesFromInputStreamEnum(is, charset);
0887: } catch (UnsupportedEncodingException e) {
0888: return null;
0889: }
0890: }
0891:
0892: public static class sentencesFromInputStreamEnum implements
0893: Iterator<StringBuffer> {
0894: // read sentences from a given input stream
0895: // this enumerates StringBuffer objects
0896:
0897: StringBuffer buffer = null;
0898: BufferedReader raf;
0899: int counter = 0;
0900: boolean pre = false;
0901:
0902: public sentencesFromInputStreamEnum(InputStream is,
0903: String charset) throws UnsupportedEncodingException {
0904: raf = new BufferedReader(
0905: (charset == null) ? new InputStreamReader(is)
0906: : new InputStreamReader(is, charset));
0907: buffer = nextElement0();
0908: counter = 0;
0909: pre = false;
0910: }
0911:
0912: public void pre(boolean x) {
0913: this .pre = x;
0914: }
0915:
0916: private StringBuffer nextElement0() {
0917: try {
0918: StringBuffer s = readSentence(raf, pre);
0919: //System.out.println(" SENTENCE='" + s + "'"); // DEBUG
0920: if (s == null) {
0921: raf.close();
0922: return null;
0923: }
0924: return s;
0925: } catch (IOException e) {
0926: try {
0927: raf.close();
0928: } catch (Exception ee) {
0929: }
0930: return null;
0931: }
0932: }
0933:
0934: public boolean hasNext() {
0935: return buffer != null;
0936: }
0937:
0938: public StringBuffer next() {
0939: if (buffer == null) {
0940: return null;
0941: } else {
0942: counter = counter + buffer.length() + 1;
0943: StringBuffer r = buffer;
0944: buffer = nextElement0();
0945: return r;
0946: }
0947: }
0948:
0949: public int count() {
0950: return counter;
0951: }
0952:
0953: public void remove() {
0954: throw new UnsupportedOperationException();
0955: }
0956: }
0957:
0958: static StringBuffer readSentence(Reader reader, boolean pre)
0959: throws IOException {
0960: StringBuffer s = new StringBuffer();
0961: int nextChar;
0962: char c;
0963:
0964: // find sentence end
0965: for (;;) {
0966: nextChar = reader.read();
0967: //System.out.print((char) nextChar); // DEBUG
0968: if (nextChar < 0) {
0969: if (s.length() == 0)
0970: return null;
0971: else
0972: break;
0973: }
0974: c = (char) nextChar;
0975: s.append(c);
0976: if (pre) {
0977: if ((c == (char) 10) || (c == (char) 13))
0978: break;
0979: } else {
0980: if (htmlFilterContentScraper.punctuation(c))
0981: break;
0982: }
0983: }
0984:
0985: // replace line endings and tabs by blanks
0986: for (int i = 0; i < s.length(); i++) {
0987: if ((s.charAt(i) == (char) 10)
0988: || (s.charAt(i) == (char) 13)
0989: || (s.charAt(i) == (char) 8))
0990: s.setCharAt(i, ' ');
0991: }
0992: // remove all double-spaces
0993: int p;
0994: while ((p = s.indexOf(" ")) >= 0)
0995: s.deleteCharAt(p);
0996: return s;
0997: }
0998:
0999: public static Map<String, wordStatProp> getWords(byte[] text,
1000: String charset) throws UnsupportedEncodingException {
1001: // returns a word/wordStatProp relation map
1002: if (text == null)
1003: return null;
1004: ByteArrayInputStream buffer = new ByteArrayInputStream(text);
1005: return new plasmaCondenser(buffer, charset, 2, 1).words();
1006: }
1007:
1008: public static Map<String, wordStatProp> getWords(String text) {
1009: // returns a word/wordStatProp relation map
1010: if (text == null)
1011: return null;
1012: ByteArrayInputStream buffer = new ByteArrayInputStream(text
1013: .getBytes());
1014: try {
1015: return new plasmaCondenser(buffer, "UTF-8", 2, 1).words();
1016: } catch (UnsupportedEncodingException e) {
1017: return null;
1018: }
1019: }
1020:
1021: public static void main(String[] args) {
1022: // read a property file and convert them into configuration lines
1023: try {
1024: File f = new File(args[0]);
1025: Properties p = new Properties();
1026: p.load(new FileInputStream(f));
1027: StringBuffer sb = new StringBuffer();
1028: sb.append("{\n");
1029: for (int i = 0; i <= 15; i++) {
1030: sb.append('"');
1031: String s = p.getProperty("keywords" + i);
1032: String[] l = s.split(",");
1033: for (int j = 0; j < l.length; j++) {
1034: sb.append(word2hash(l[j]));
1035: }
1036: if (i < 15)
1037: sb.append(",\n");
1038: }
1039: sb.append("}\n");
1040: System.out.println(new String(sb));
1041: } catch (FileNotFoundException e) {
1042: e.printStackTrace();
1043: } catch (IOException e) {
1044: e.printStackTrace();
1045: }
1046:
1047: }
1048:
1049: }
|