001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017: package org.apache.commons.validator;
018:
019: import java.io.Serializable;
020: import java.util.Arrays;
021: import java.util.HashSet;
022: import java.util.Set;
023:
024: import org.apache.commons.validator.util.Flags;
025: import org.apache.oro.text.perl.Perl5Util;
026:
027: /**
028: * <p>Validates URLs.</p>
029: * Behavour of validation is modified by passing in options:
030: * <li>ALLOW_2_SLASHES - [FALSE] Allows double '/' characters in the path
031: * component.</li>
032: * <li>NO_FRAGMENT- [FALSE] By default fragments are allowed, if this option is
033: * included then fragments are flagged as illegal.</li>
034: * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
035: * considered valid schemes. Enabling this option will let any scheme pass validation.</li>
036: *
037: * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
038: * http://javascript.internet.com. However, this validation now bears little resemblance
039: * to the php original.</p>
040: * <pre>
041: * Example of usage:
042: * Construct a UrlValidator with valid schemes of "http", and "https".
043: *
044: * String[] schemes = {"http","https"}.
045: * UrlValidator urlValidator = new UrlValidator(schemes);
046: * if (urlValidator.isValid("ftp://foo.bar.com/")) {
047: * System.out.println("url is valid");
048: * } else {
049: * System.out.println("url is invalid");
050: * }
051: *
052: * prints "url is invalid"
053: * If instead the default constructor is used.
054: *
055: * UrlValidator urlValidator = new UrlValidator();
056: * if (urlValidator.isValid("ftp://foo.bar.com/")) {
057: * System.out.println("url is valid");
058: * } else {
059: * System.out.println("url is invalid");
060: * }
061: *
062: * prints out "url is valid"
063: * </pre>
064: *
065: * @see
066: * <a href='http://www.ietf.org/rfc/rfc2396.txt' >
067: * Uniform Resource Identifiers (URI): Generic Syntax
068: * </a>
069: *
070: * @version $Revision: 478334 $ $Date: 2006-11-22 21:31:54 +0000 (Wed, 22 Nov 2006) $
071: * @since Validator 1.1
072: */
073: public class UrlValidator implements Serializable {
074:
075: /**
076: * Allows all validly formatted schemes to pass validation instead of
077: * supplying a set of valid schemes.
078: */
079: public static final int ALLOW_ALL_SCHEMES = 1 << 0;
080:
081: /**
082: * Allow two slashes in the path component of the URL.
083: */
084: public static final int ALLOW_2_SLASHES = 1 << 1;
085:
086: /**
087: * Enabling this options disallows any URL fragments.
088: */
089: public static final int NO_FRAGMENTS = 1 << 2;
090:
091: private static final String ALPHA_CHARS = "a-zA-Z";
092:
093: private static final String ALPHA_NUMERIC_CHARS = ALPHA_CHARS
094: + "\\d";
095:
096: private static final String SPECIAL_CHARS = ";/@&=,.?:+$";
097:
098: private static final String VALID_CHARS = "[^\\s" + SPECIAL_CHARS
099: + "]";
100:
101: private static final String SCHEME_CHARS = ALPHA_CHARS;
102:
103: // Drop numeric, and "+-." for now
104: private static final String AUTHORITY_CHARS = ALPHA_NUMERIC_CHARS
105: + "\\-\\.";
106:
107: private static final String ATOM = VALID_CHARS + '+';
108:
109: /**
110: * This expression derived/taken from the BNF for URI (RFC2396).
111: */
112: private static final String URL_PATTERN = "/^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?/";
113: // 12 3 4 5 6 7 8 9
114:
115: /**
116: * Schema/Protocol (ie. http:, ftp:, file:, etc).
117: */
118: private static final int PARSE_URL_SCHEME = 2;
119:
120: /**
121: * Includes hostname/ip and port number.
122: */
123: private static final int PARSE_URL_AUTHORITY = 4;
124:
125: private static final int PARSE_URL_PATH = 5;
126:
127: private static final int PARSE_URL_QUERY = 7;
128:
129: private static final int PARSE_URL_FRAGMENT = 9;
130:
131: /**
132: * Protocol (ie. http:, ftp:,https:).
133: */
134: private static final String SCHEME_PATTERN = "/^[" + SCHEME_CHARS
135: + "]/";
136:
137: private static final String AUTHORITY_PATTERN = "/^(["
138: + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?/";
139: // 1 2 3 4
140:
141: private static final int PARSE_AUTHORITY_HOST_IP = 1;
142:
143: private static final int PARSE_AUTHORITY_PORT = 2;
144:
145: /**
146: * Should always be empty.
147: */
148: private static final int PARSE_AUTHORITY_EXTRA = 3;
149:
150: private static final String PATH_PATTERN = "/^(/[-\\w:@&?=+,.!/~*'%$_;]*)?$/";
151:
152: private static final String QUERY_PATTERN = "/^(.*)$/";
153:
154: private static final String LEGAL_ASCII_PATTERN = "/^[\\000-\\177]+$/";
155:
156: private static final String IP_V4_DOMAIN_PATTERN = "/^(\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})$/";
157:
158: private static final String DOMAIN_PATTERN = "/^" + ATOM + "(\\."
159: + ATOM + ")*$/";
160:
161: private static final String PORT_PATTERN = "/^:(\\d{1,5})$/";
162:
163: private static final String ATOM_PATTERN = "/(" + ATOM + ")/";
164:
165: private static final String ALPHA_PATTERN = "/^[" + ALPHA_CHARS
166: + "]/";
167:
168: /**
169: * Holds the set of current validation options.
170: */
171: private Flags options = null;
172:
173: /**
174: * The set of schemes that are allowed to be in a URL.
175: */
176: private Set allowedSchemes = new HashSet();
177:
178: /**
179: * If no schemes are provided, default to this set.
180: */
181: protected String[] defaultSchemes = { "http", "https", "ftp" };
182:
183: /**
184: * Create a UrlValidator with default properties.
185: */
186: public UrlValidator() {
187: this (null);
188: }
189:
190: /**
191: * Behavior of validation is modified by passing in several strings options:
192: * @param schemes Pass in one or more url schemes to consider valid, passing in
193: * a null will default to "http,https,ftp" being valid.
194: * If a non-null schemes is specified then all valid schemes must
195: * be specified. Setting the ALLOW_ALL_SCHEMES option will
196: * ignore the contents of schemes.
197: */
198: public UrlValidator(String[] schemes) {
199: this (schemes, 0);
200: }
201:
202: /**
203: * Initialize a UrlValidator with the given validation options.
204: * @param options The options should be set using the public constants declared in
205: * this class. To set multiple options you simply add them together. For example,
206: * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
207: */
208: public UrlValidator(int options) {
209: this (null, options);
210: }
211:
212: /**
213: * Behavour of validation is modified by passing in options:
214: * @param schemes The set of valid schemes.
215: * @param options The options should be set using the public constants declared in
216: * this class. To set multiple options you simply add them together. For example,
217: * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
218: */
219: public UrlValidator(String[] schemes, int options) {
220: this .options = new Flags(options);
221:
222: if (this .options.isOn(ALLOW_ALL_SCHEMES)) {
223: return;
224: }
225:
226: if (schemes == null) {
227: schemes = this .defaultSchemes;
228: }
229:
230: this .allowedSchemes.addAll(Arrays.asList(schemes));
231: }
232:
233: /**
234: * <p>Checks if a field has a valid url address.</p>
235: *
236: * @param value The value validation is being performed on. A <code>null</code>
237: * value is considered invalid.
238: * @return true if the url is valid.
239: */
240: public boolean isValid(String value) {
241: if (value == null) {
242: return false;
243: }
244:
245: Perl5Util matchUrlPat = new Perl5Util();
246: Perl5Util matchAsciiPat = new Perl5Util();
247:
248: if (!matchAsciiPat.match(LEGAL_ASCII_PATTERN, value)) {
249: return false;
250: }
251:
252: // Check the whole url address structure
253: if (!matchUrlPat.match(URL_PATTERN, value)) {
254: return false;
255: }
256:
257: if (!isValidScheme(matchUrlPat.group(PARSE_URL_SCHEME))) {
258: return false;
259: }
260:
261: if (!isValidAuthority(matchUrlPat.group(PARSE_URL_AUTHORITY))) {
262: return false;
263: }
264:
265: if (!isValidPath(matchUrlPat.group(PARSE_URL_PATH))) {
266: return false;
267: }
268:
269: if (!isValidQuery(matchUrlPat.group(PARSE_URL_QUERY))) {
270: return false;
271: }
272:
273: if (!isValidFragment(matchUrlPat.group(PARSE_URL_FRAGMENT))) {
274: return false;
275: }
276:
277: return true;
278: }
279:
280: /**
281: * Validate scheme. If schemes[] was initialized to a non null,
282: * then only those scheme's are allowed. Note this is slightly different
283: * than for the constructor.
284: * @param scheme The scheme to validate. A <code>null</code> value is considered
285: * invalid.
286: * @return true if valid.
287: */
288: protected boolean isValidScheme(String scheme) {
289: if (scheme == null) {
290: return false;
291: }
292:
293: Perl5Util schemeMatcher = new Perl5Util();
294: if (!schemeMatcher.match(SCHEME_PATTERN, scheme)) {
295: return false;
296: }
297:
298: if (this .options.isOff(ALLOW_ALL_SCHEMES)) {
299:
300: if (!this .allowedSchemes.contains(scheme)) {
301: return false;
302: }
303: }
304:
305: return true;
306: }
307:
308: /**
309: * Returns true if the authority is properly formatted. An authority is the combination
310: * of hostname and port. A <code>null</code> authority value is considered invalid.
311: * @param authority Authority value to validate.
312: * @return true if authority (hostname and port) is valid.
313: */
314: protected boolean isValidAuthority(String authority) {
315: if (authority == null) {
316: return false;
317: }
318:
319: Perl5Util authorityMatcher = new Perl5Util();
320: Perl5Util matchIPV4Pat = new Perl5Util();
321:
322: if (!authorityMatcher.match(AUTHORITY_PATTERN, authority)) {
323: return false;
324: }
325:
326: boolean ipV4Address = false;
327: boolean hostname = false;
328: // check if authority is IP address or hostname
329: String hostIP = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
330: ipV4Address = matchIPV4Pat.match(IP_V4_DOMAIN_PATTERN, hostIP);
331:
332: if (ipV4Address) {
333: // this is an IP address so check components
334: for (int i = 1; i <= 4; i++) {
335: String ipSegment = matchIPV4Pat.group(i);
336: if (ipSegment == null || ipSegment.length() <= 0) {
337: return false;
338: }
339:
340: try {
341: if (Integer.parseInt(ipSegment) > 255) {
342: return false;
343: }
344: } catch (NumberFormatException e) {
345: return false;
346: }
347:
348: }
349: } else {
350: // Domain is hostname name
351: Perl5Util domainMatcher = new Perl5Util();
352: hostname = domainMatcher.match(DOMAIN_PATTERN, hostIP);
353: }
354:
355: //rightmost hostname will never start with a digit.
356: if (hostname) {
357: // LOW-TECH FIX FOR VALIDATOR-202
358: // TODO: Rewrite to use ArrayList and .add semantics: see VALIDATOR-203
359: char[] chars = hostIP.toCharArray();
360: int size = 1;
361: for (int i = 0; i < chars.length; i++) {
362: if (chars[i] == '.') {
363: size++;
364: }
365: }
366: String[] domainSegment = new String[size];
367: boolean match = true;
368: int segmentCount = 0;
369: int segmentLength = 0;
370: Perl5Util atomMatcher = new Perl5Util();
371:
372: while (match) {
373: match = atomMatcher.match(ATOM_PATTERN, hostIP);
374: if (match) {
375: domainSegment[segmentCount] = atomMatcher.group(1);
376: segmentLength = domainSegment[segmentCount]
377: .length() + 1;
378: hostIP = (segmentLength >= hostIP.length()) ? ""
379: : hostIP.substring(segmentLength);
380:
381: segmentCount++;
382: }
383: }
384: String topLevel = domainSegment[segmentCount - 1];
385: if (topLevel.length() < 2 || topLevel.length() > 4) {
386: return false;
387: }
388:
389: // First letter of top level must be a alpha
390: Perl5Util alphaMatcher = new Perl5Util();
391: if (!alphaMatcher.match(ALPHA_PATTERN, topLevel.substring(
392: 0, 1))) {
393: return false;
394: }
395:
396: // Make sure there's a host name preceding the authority.
397: if (segmentCount < 2) {
398: return false;
399: }
400: }
401:
402: if (!hostname && !ipV4Address) {
403: return false;
404: }
405:
406: String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
407: if (port != null) {
408: Perl5Util portMatcher = new Perl5Util();
409: if (!portMatcher.match(PORT_PATTERN, port)) {
410: return false;
411: }
412: }
413:
414: String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
415: if (!GenericValidator.isBlankOrNull(extra)) {
416: return false;
417: }
418:
419: return true;
420: }
421:
422: /**
423: * Returns true if the path is valid. A <code>null</code> value is considered invalid.
424: * @param path Path value to validate.
425: * @return true if path is valid.
426: */
427: protected boolean isValidPath(String path) {
428: if (path == null) {
429: return false;
430: }
431:
432: Perl5Util pathMatcher = new Perl5Util();
433:
434: if (!pathMatcher.match(PATH_PATTERN, path)) {
435: return false;
436: }
437:
438: int slash2Count = countToken("//", path);
439: if (this .options.isOff(ALLOW_2_SLASHES) && (slash2Count > 0)) {
440: return false;
441: }
442:
443: int slashCount = countToken("/", path);
444: int dot2Count = countToken("..", path);
445: if (dot2Count > 0) {
446: if ((slashCount - slash2Count - 1) <= dot2Count) {
447: return false;
448: }
449: }
450:
451: return true;
452: }
453:
454: /**
455: * Returns true if the query is null or it's a properly formatted query string.
456: * @param query Query value to validate.
457: * @return true if query is valid.
458: */
459: protected boolean isValidQuery(String query) {
460: if (query == null) {
461: return true;
462: }
463:
464: Perl5Util queryMatcher = new Perl5Util();
465: return queryMatcher.match(QUERY_PATTERN, query);
466: }
467:
468: /**
469: * Returns true if the given fragment is null or fragments are allowed.
470: * @param fragment Fragment value to validate.
471: * @return true if fragment is valid.
472: */
473: protected boolean isValidFragment(String fragment) {
474: if (fragment == null) {
475: return true;
476: }
477:
478: return this .options.isOff(NO_FRAGMENTS);
479: }
480:
481: /**
482: * Returns the number of times the token appears in the target.
483: * @param token Token value to be counted.
484: * @param target Target value to count tokens in.
485: * @return the number of tokens.
486: */
487: protected int countToken(String token, String target) {
488: int tokenIndex = 0;
489: int count = 0;
490: while (tokenIndex != -1) {
491: tokenIndex = target.indexOf(token, tokenIndex);
492: if (tokenIndex > -1) {
493: tokenIndex++;
494: count++;
495: }
496: }
497: return count;
498: }
499: }
|