001: /**********************************************************************************
002: * $URL: https://source.sakaiproject.org/svn/search/tags/sakai_2-4-1/search-api/api/src/java/org/sakaiproject/search/api/SearchUtils.java $
003: * $Id: SearchUtils.java 21387 2007-02-11 19:37:04Z ian@caret.cam.ac.uk $
004: ***********************************************************************************
005: *
006: * Copyright (c) 2006 The Sakai Foundation.
007: *
008: * Licensed under the Educational Community License, Version 1.0 (the "License");
009: * you may not use this file except in compliance with the License.
010: * You may obtain a copy of the License at
011: *
012: * http://www.opensource.org/licenses/ecl1.php
013: *
014: * Unless required by applicable law or agreed to in writing, software
015: * distributed under the License is distributed on an "AS IS" BASIS,
016: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017: * See the License for the specific language governing permissions and
018: * limitations under the License.
019: *
020: **********************************************************************************/package org.sakaiproject.search.api;
021:
/**
 * Static helpers used to clean up text before it is handed to the search
 * index.
 */
public class SearchUtils {

    /**
     * Characters removed by {@link #getCleanStringXX(String)}: the control
     * characters U+0000 to U+0008, U+000B, U+000C and U+000E to U+001F, any
     * code point in the surrogate range U+D800 to U+DFFF, and the
     * non-characters U+FFFE and U+FFFF. Tab, line feed and carriage return are
     * deliberately not in the set. Compiled once rather than on every call.
     */
    private static final java.util.regex.Pattern UNCLEAN_CHARS = java.util.regex.Pattern
            .compile("[\\x00-\\x08\\x0b\\x0c\\x0e-\\x1f\\ud800-\\udfff\\uffff\\ufffe]");

    /** The no-break space (code unit 160), which Character.isWhitespace does not report as whitespace. */
    private static final char NO_BREAK_SPACE = 160;

    /**
     * Removes every character matched by the "unclean" character set from the
     * supplied text. Nothing is inserted in place of a removed character.
     *
     * @param text
     *        the text to clean, may be null
     * @return the text with the unwanted characters removed, or null if
     *         text was null
     */
    public static String getCleanStringXX(String text) {
        if (text == null) {
            return null;
        }
        return UNCLEAN_CHARS.matcher(text).replaceAll("");
    }

    /**
     * Appends the supplied string to the buffer unchanged.
     * <p>
     * Word-length filtering is switched off in this implementation: the
     * previous filtering code sat behind an unconditional early return and
     * could never run, so it has been removed. The minWordLength argument is
     * accepted only to keep the signature stable for existing callers.
     *
     * @param string
     *        the text to append; null appends nothing
     * @param sb
     *        the buffer to append to; a new one is created when null
     * @param minWordLength
     *        ignored
     * @return the buffer that was appended to (sb itself when it was not
     *         null)
     */
    public static StringBuilder filterWordLengthIgnore(String string,
            StringBuilder sb, int minWordLength) {
        if (sb == null) {
            sb = new StringBuilder();
        }
        if (string != null) {
            sb.append(string);
        }
        return sb;
    }

    /**
     * Appends the words of the supplied string to the buffer, dropping
     * whitespace, control characters and other unwanted characters.
     * <p>
     * Each run of kept characters is preceded by exactly one space, including
     * the first run, so the appended text always starts with a space when the
     * input contains at least one kept character. Trailing unwanted characters
     * produce no output.
     *
     * @param string
     *        the text to clean and append; null appends nothing
     * @param sb
     *        the buffer to append to; a new one is created when null
     * @return the buffer that was appended to (sb itself when it was not
     *         null)
     */
    public static StringBuilder appendCleanString(String string,
            StringBuilder sb) {
        if (sb == null) {
            sb = new StringBuilder();
        }
        if (string == null) {
            return sb;
        }
        // Starts true so the first kept character is also preceded by a space.
        boolean separatorPending = true;
        for (int i = 0; i < string.length(); i++) {
            char c = string.charAt(i);
            if (isIgnorable(c)) {
                separatorPending = true;
            } else {
                if (separatorPending) {
                    sb.append(' ');
                    separatorPending = false;
                }
                sb.append(c);
            }
        }
        return sb;
    }

    /**
     * Tells whether a character is dropped (and treated as a word separator)
     * by {@link #appendCleanString(String, StringBuilder)}.
     *
     * @param c
     *        the character to test
     * @return true for whitespace, ISO control characters, the no-break
     *         space, surrogate code units and the non-characters U+FFFE and
     *         U+FFFF
     */
    private static boolean isIgnorable(char c) {
        // Character.isISOControl covers U+0000 to U+001F and U+007F to U+009F,
        // so the individual low control ranges need no separate comparisons.
        return Character.isWhitespace(c)
                || Character.isISOControl(c)
                || c == NO_BREAK_SPACE
                || (c >= Character.MIN_SURROGATE && c <= Character.MAX_SURROGATE)
                || c == 0xfffe
                || c == 0xffff;
    }

}
|