001: /*
002: * This file is part of DrFTPD, Distributed FTP Daemon.
003: *
004: * DrFTPD is free software; you can redistribute it and/or modify
005: * it under the terms of the GNU General Public License as published by
006: * the Free Software Foundation; either version 2 of the License, or
007: * (at your option) any later version.
008: *
009: * DrFTPD is distributed in the hope that it will be useful,
010: * but WITHOUT ANY WARRANTY; without even the implied warranty of
011: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
012: * GNU General Public License for more details.
013: *
014: * You should have received a copy of the GNU General Public License
015: * along with DrFTPD; if not, write to the Free Software
016: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
017: */
018: package net.drmods.plugins.irc.imdb;
019:
020: import java.io.BufferedReader;
021: import java.io.IOException;
022: import java.io.InputStream;
023: import java.io.InputStreamReader;
024: import java.net.MalformedURLException;
025: import java.net.URL;
026: import java.net.URLConnection;
027: import java.util.regex.Matcher;
028: import java.util.regex.Pattern;
029:
030: import org.apache.log4j.Logger;
031: import org.drftpd.plugins.SiteBot;
032: import org.tanesha.replacer.ReplacerEnvironment;
033:
034: /**
035: * @author Teflon
036: */
037: public class IMDBParser {
038: private static final Logger logger = Logger
039: .getLogger(IMDBParser.class);
040: private String[] _seperators = { ".", "-", "_" };
041: private String[] _filters;
042:
043: private static final String _baseUrl = "http://www.imdb.com";
044: private static final String _searchUrl = "http://www.imdb.com/find?tt=on;nm=on;mx=5;q=";
045:
046: private boolean _foundFilm;
047:
048: private String _title;
049: private String _genre;
050: private String _plot;
051: private String _votes;
052: private String _rating;
053: private String _year;
054: private String _url;
055:
056: public IMDBParser(String searchStr, String filters) {
057: _filters = filters.split(";");
058: _foundFilm = getInfo(searchStr);
059: }
060:
061: public String getGenre() {
062: return foundFilm() ? _genre : "N/A";
063: }
064:
065: public String getPlot() {
066: return foundFilm() ? _plot : "N/A";
067: }
068:
069: public String getRating() {
070: return foundFilm() ? _rating : "N/A";
071: }
072:
073: public String getTitle() {
074: return foundFilm() ? _title : "N/A";
075: }
076:
077: public String getVotes() {
078: return foundFilm() ? _votes : "N/A";
079: }
080:
081: public String getYear() {
082: return foundFilm() ? _year : "N/A";
083: }
084:
085: public String getURL() {
086: return foundFilm() ? _url : "N/A";
087: }
088:
089: public boolean foundFilm() {
090: return _foundFilm;
091: }
092:
093: private boolean getInfo(String searchString) {
094: try {
095: String urlString = _searchUrl + filterTitle(searchString);
096:
097: URL url = new URL(urlString);
098: URLConnection urlConn = url.openConnection();
099:
100: if (!(urlConn.getContent() instanceof InputStream))
101: return false;
102:
103: boolean redirect = false;
104: String data = "";
105: BufferedReader in = null;
106: String line;
107: try {
108: in = new BufferedReader(new InputStreamReader(urlConn
109: .getInputStream()));
110: while ((line = in.readLine()) != null) {
111: data += line + "\n";
112: }
113: } finally {
114: if (in != null) {
115: in.close();
116: }
117: }
118:
119: if (data.indexOf("<b>No Matches.</b>") > 0)
120: return false;
121:
122: if (data.indexOf("<title>IMDb name and title search") >= 0
123: || data.indexOf("<a href=\"/title/tt") >= 0) {
124: int start = data.indexOf("/title/tt");
125: if (start > 0) {
126: int end = data.indexOf("/", start
127: + "/title/tt".length());
128: _url = data.substring(start, end);
129: if (_url.indexOf("http://") < 0)
130: _url = _baseUrl + _url;
131: }
132: if (_url == null)
133: return false;
134: } else {
135: _url = urlString;
136: }
137:
138: url = new URL(_url);
139: urlConn = url.openConnection();
140: if (!(urlConn.getContent() instanceof InputStream))
141: return false;
142: try {
143: in = new BufferedReader(new InputStreamReader(urlConn
144: .getInputStream()));
145:
146: while ((line = in.readLine()) != null)
147: data = data + line + "\n";
148: } finally {
149: in.close();
150: }
151:
152: _title = parseData(data, "<strong class=\"title\">",
153: "<small>");
154: _genre = parseData(data, "<b class=\"ch\">Genre:</b>",
155: "<br><br>");
156: _genre = _genre.replaceAll("\\(more\\)", "").trim();
157: _plot = parseData(data,
158: "<b class=\"ch\">Plot Outline:</b>", "<a href=\"");
159: _rating = parseData(data,
160: "<b class=\"ch\">User Rating:</b>", "</b>");
161: _rating = _rating.equals("N/A") || _rating.indexOf("/") < 0 ? "N/A"
162: : _rating.substring(0, _rating.indexOf("/"));
163: _votes = parseData(data,
164: "<b class=\"ch\">User Rating:</b>", "<br><br>");
165: _votes = _votes.indexOf("(") < 0
166: || _votes.indexOf("votes") < 0 ? "N/A" : _votes
167: .substring(_votes.indexOf("(") + 1,
168: _votes.indexOf("votes")).trim();
169: _year = parseData(data, "<a href=\"/Sections/Years/",
170: "</a>");
171: if (_year.length() >= 6)
172: _year = _year.substring(6);
173:
174: } catch (MalformedURLException e) {
175: e.printStackTrace();
176: return false;
177: } catch (IOException e) {
178: e.printStackTrace();
179: return false;
180: }
181:
182: return true;
183: }
184:
185: public ReplacerEnvironment getEnv() {
186: ReplacerEnvironment env = new ReplacerEnvironment(
187: SiteBot.GLOBAL_ENV);
188: env.add("title", getTitle());
189: env.add("genre", getGenre());
190: env.add("plot", getPlot());
191: env.add("rating", getRating());
192: env.add("votes", getVotes());
193: env.add("year", getYear());
194: env.add("url", getURL());
195: return env;
196: }
197:
198: private String filterTitle(String title) {
199: String newTitle = title.toLowerCase();
200:
201: //remove the group name
202: if (newTitle.lastIndexOf("-") >= 0)
203: newTitle = newTitle.substring(0, newTitle.lastIndexOf("-"));
204:
205: //remove seperators
206: for (int i = 0; i < _seperators.length; i++)
207: newTitle = newTitle.replaceAll("\\"
208: + _seperators[i].toLowerCase(), " ");
209:
210: //remove filtered words
211: for (int i = 0; i < _filters.length; i++)
212: newTitle = newTitle.replaceAll("\\b"
213: + _filters[i].toLowerCase() + "\\b", "");
214:
215: //remove extra spaces
216: while (newTitle.indexOf(" ") > 0)
217: newTitle = newTitle.replaceAll(" ", " ");
218:
219: //convert spaces to +
220: newTitle = newTitle.trim().replaceAll("\\s", "+");
221:
222: return newTitle;
223: }
224:
225: private String parseData(String data, String startText,
226: String endText) {
227: int start, end;
228: start = data.indexOf(startText);
229: if (start > 0) {
230: start = start + startText.length();
231: end = data.indexOf(endText, start);
232: return htmlToString(data.substring(start, end)).trim();
233: }
234: return "N/A";
235: }
236:
237: private String htmlToString(String input) {
238: String str = input.replaceAll("\n", "");
239: while (str.indexOf("<") != -1) {
240: int startPos = str.indexOf("<");
241: int endPos = str.indexOf(">", startPos);
242: if (endPos > startPos) {
243: String beforeTag = str.substring(0, startPos);
244: String afterTag = str.substring(endPos + 1);
245: str = beforeTag + afterTag;
246: }
247: }
248:
249: String mbChar;
250: String mbs = "&#(\\d+);";
251: StringBuffer sb = new StringBuffer();
252: Pattern pat = Pattern.compile(mbs);
253: Matcher mat = pat.matcher(str);
254:
255: while (mat.find()) {
256: mbChar = getMbCharStr(mat.group(1));
257: mat.appendReplacement(sb, mbChar);
258: }
259: mat.appendTail(sb);
260: return new String(sb);
261: }
262:
263: private String getMbCharStr(String digits) {
264: char[] cha = new char[1];
265:
266: try {
267: int val = Integer.parseInt(digits);
268: char ch = (char) val;
269: cha[0] = ch;
270: } catch (Exception e) {
271: System.err.println("Error from getMbCharStr:");
272: e.printStackTrace(System.err);
273: }
274: return new String(cha);
275: }
276: }
|