001: /*
002:
003: Derby - Class org.apache.derby.impl.load.ImportReadData
004:
005: Licensed to the Apache Software Foundation (ASF) under one or more
006: contributor license agreements. See the NOTICE file distributed with
007: this work for additional information regarding copyright ownership.
008: The ASF licenses this file to You under the Apache License, Version 2.0
009: (the "License"); you may not use this file except in compliance with
010: the License. You may obtain a copy of the License at
011:
012: http://www.apache.org/licenses/LICENSE-2.0
013:
014: Unless required by applicable law or agreed to in writing, software
015: distributed under the License is distributed on an "AS IS" BASIS,
016: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017: See the License for the specific language governing permissions and
018: limitations under the License.
019:
020: */
021:
022: package org.apache.derby.impl.load;
023:
024: import java.io.BufferedReader;
025: import java.io.FileNotFoundException;
026: import java.io.InputStream;
027: import java.io.InputStreamReader;
028: import java.io.FileInputStream;
029: import java.io.IOException;
030: import java.net.MalformedURLException;
031: import java.net.URL;
032: import org.apache.derby.iapi.services.sanity.SanityManager;
033:
034: final class ImportReadData implements
035: java.security.PrivilegedExceptionAction {
// Name (or URL) of the data file to read. realOpenFile() replaces it with
// the plain path when it was given as a "file:" URL.
private String inputFileName;

// State used only by readNextFixedRow(): per-column widths, total width of
// one fixed-format row, and the char buffer one row is read into.
// NOTE(review): none of these three is assigned in the visible source.
private int[] columnWidths;
private int rowWidth;
private char[] tempString;
// Characters consumed by the most recent readTokensUntilEndOfRecord() call;
// findNumberOfColumnsInARow() uses 0 as its "file is empty" signal.
private int numberOfCharsReadSoFar;

// Reader over the input data; (re)created by realOpenFile().
private BufferedReader bufferedReader;

// Buffer holding the token currently being built. It starts at START_SIZE
// chars and is doubled whenever it fills up; currentTokenMaxSize tracks its
// current capacity.
private static final int START_SIZE = 10240;
private char[] currentToken = new char[START_SIZE];
private int currentTokenMaxSize = START_SIZE;

// True once a field start delimiter has been matched and until the matching
// stop delimiter is found; while set, only the stop delimiter is looked for.
boolean foundStartDelimiter;
// Number of characters of the current token held in currentToken.
// readNextToken() sets it to -1 to signal a null value.
int totalCharsSoFar;
// Count of leading white space characters of the current token.
int positionOfNonWhiteSpaceCharInFront;
// Count of trailing white space characters of the current token.
int positionOfNonWhiteSpaceCharInBack;
// Number of rows returned so far; incremented at the end of each
// successfully read row.
int lineNumber;
// Number of characters of the field start / stop delimiter matched so far.
int fieldStartDelimiterIndex;
int fieldStopDelimiterIndex;
// Value of totalCharsSoFar right after a stop delimiter was stripped, i.e.
// the length of the delimited value.
int stopDelimiterPosition;
// True once both the start and the stop delimiter of the current token have
// been seen; distinguishes an empty string ("") from a null value (,,).
boolean foundStartAndStopDelimiters;

// In the constructor the stream is opened only to find out the number of
// columns of a delimited file (for a fixed format that is already known
// from the control file) and is then closed again. The stream is reopened
// when the first record is read, i.e. the first time the next row is
// requested. This was done for bug 1032.
boolean streamOpenForReading;

static final int DEFAULT_FORMAT_CODE = 0;
static final int ASCII_FIXED_FORMAT_CODE = 1;
// Selects delimited (DEFAULT_FORMAT_CODE) or fixed-width row parsing.
// NOTE(review): never reassigned in the visible source.
private int formatCode = DEFAULT_FORMAT_CODE;
// True when the first row of the file is a column definition row that must
// be skipped; set by loadMetaData().
private boolean hasColumnDefinition;
// First character of the record / field separator. recordSeparatorChar0 is
// also overwritten by lookForPassedSeparator() with the line terminator
// character actually seen.
private char recordSeparatorChar0;
private char fieldSeparatorChar0;
// False when the corresponding separator starts with a white space
// character, in which case that character must not be trimmed from values.
private boolean recordSepStartNotWhite = true;
private boolean fieldSepStartNotWhite = true;

// Source of the import properties copied into the fields below.
protected ControlInfo controlFileReader;

// Number of columns in a row, determined by reading the first row.
protected int numberOfColumns;

// The types of the columns that we are about to read.
protected String[] columnTypes;

// Local copies of the control file properties (see loadPropertiesInfo()).
protected char[] fieldSeparator;
protected int fieldSeparatorLength;
protected char[] recordSeparator;
protected int recordSeparatorLength;
protected String nullString;
protected String columnDefinition;
protected String format;
protected String dataCodeset;
protected char[] fieldStartDelimiter;
protected int fieldStartDelimiterLength;
protected char[] fieldStopDelimiter;
protected int fieldStopDelimiterLength;
protected boolean hasDelimiterAtEnd;
105:
106: //load the control file properties info locally, since we need to refer to them
107: //all the time while looking for tokens
108: private void loadPropertiesInfo() throws Exception {
109: fieldSeparator = controlFileReader.getFieldSeparator()
110: .toCharArray();
111: fieldSeparatorLength = fieldSeparator.length;
112: recordSeparator = controlFileReader.getRecordSeparator()
113: .toCharArray();
114: recordSeparatorLength = recordSeparator.length;
115: nullString = controlFileReader.getNullString();
116: columnDefinition = controlFileReader.getColumnDefinition();
117: format = controlFileReader.getFormat();
118: dataCodeset = controlFileReader.getDataCodeset();
119: fieldStartDelimiter = controlFileReader
120: .getFieldStartDelimiter().toCharArray();
121: fieldStartDelimiterLength = fieldStartDelimiter.length;
122: fieldStopDelimiter = controlFileReader.getFieldEndDelimiter()
123: .toCharArray();
124: fieldStopDelimiterLength = fieldStopDelimiter.length;
125: hasDelimiterAtEnd = controlFileReader.getHasDelimiterAtEnd();
126:
127: // when record or field separators start with typical white space,
128: // we can't ignore it around values in the import file. So set up
129: // a boolean so we don't keep re-testing for it.
130: if (recordSeparatorLength > 0) {
131: recordSeparatorChar0 = recordSeparator[0];
132: recordSepStartNotWhite = (Character
133: .isWhitespace(recordSeparatorChar0) == false);
134: }
135: if (fieldSeparatorLength > 0) {
136: fieldSeparatorChar0 = fieldSeparator[0];
137: fieldSepStartNotWhite = (Character
138: .isWhitespace(fieldSeparatorChar0) == false);
139: }
140: }
141:
/**
 * Creates a reader for the given data file.
 *
 * @param inputFileName     file (or URL) to read the data from
 * @param controlFileReader properties describing how to interpret the data
 * @exception Exception if the properties or the file cannot be read
 */
ImportReadData(String inputFileName, ControlInfo controlFileReader)
        throws Exception {
    this.controlFileReader = controlFileReader;
    this.inputFileName = inputFileName;

    // Keep a local copy of the control file properties; they are needed
    // constantly while scanning for tokens.
    loadPropertiesInfo();
    // Read the first row to learn how many columns make up a row and
    // remember that for later use.
    loadMetaData();
}
156:
157: //just a getter returning number of columns for a row in the data file
158: int getNumberOfColumns() {
159: return numberOfColumns;
160: }
161:
162: /**if columndefinition is true, ignore first row. The way to do that is to just
163: * look for the record separator
164: * @exception Exception if there is an error
165: */
166: protected void ignoreFirstRow() throws Exception {
167: readNextToken(recordSeparator, 0, recordSeparatorLength, true);
168: }
169:
170: /** load the column types from the meta data line to be analyzed
171: * later in the constructor of the ImportResultSetMetaData.
172: */
173: protected void loadColumnTypes() throws Exception {
174: int idx;
175: String[] metaDataArray;
176:
177: // start by counting the number of columns that we have at the
178: // meta data line
179: findNumberOfColumnsInARow();
180:
181: // reopen the file to the start of the file to read the actual column types data
182: closeStream();
183: openFile();
184:
185: // make room for the meta data
186: metaDataArray = new String[numberOfColumns];
187:
188: // read the meta data line line - meta data is always in a delimited format
189: readNextDelimitedRow(metaDataArray);
190:
191: // allocate space for the columnTypes meta data
192: // since the meta data line contains a combination of column name and
193: // column type for every column we actually have only half the number of
194: // columns that was counted.
195: columnTypes = new String[numberOfColumns / 2];
196:
197: for (idx = 0; idx < numberOfColumns; idx = idx + 2) {
198: columnTypes[idx / 2] = metaDataArray[idx + 1];
199: }
200:
201: // reopen to the start of the file so the rest of the program will
202: // work as expected
203: closeStream();
204: openFile();
205:
206: // init the numberOfColumns variable since it is
207: // being accumulate by the findNumberOfColumnsInARow method
208: numberOfColumns = 0;
209: }
210:
211: private void openFile() throws Exception {
212: try {
213: java.security.AccessController.doPrivileged(this );
214: } catch (java.security.PrivilegedActionException pae) {
215: throw pae.getException();
216: }
217: }
218:
219: public final Object run() throws Exception {
220: realOpenFile();
221: return null;
222: }
223:
224: //open the input data file for reading
225: private void realOpenFile() throws Exception {
226: InputStream inputStream;
227: try {
228: try {
229: URL url = new URL(inputFileName);
230: if (url.getProtocol().equals("file")) { //this means it's a file url
231: inputFileName = url.getFile(); //seems like you can't do openstream on file
232: throw new MalformedURLException(); //so, get the filename from url and do it ususal way
233: }
234: inputStream = url.openStream();
235: } catch (MalformedURLException ex) {
236: inputStream = new FileInputStream(inputFileName);
237:
238: }
239: } catch (FileNotFoundException ex) {
240: throw LoadError.dataFileNotFound(inputFileName);
241: } catch (SecurityException se) {
242: java.sql.SQLException sqle = LoadError
243: .dataFileNotFound(inputFileName);
244:
245: sqle.setNextException(new java.sql.SQLException("XJ001", se
246: .getMessage(), 0));
247:
248: throw sqle;
249: }
250: java.io.Reader rd = dataCodeset == null ? new InputStreamReader(
251: inputStream)
252: : new InputStreamReader(inputStream, dataCodeset);
253: bufferedReader = new BufferedReader(rd, 32 * 1024);
254: streamOpenForReading = true;
255: }
256:
257: //read the first data row to find how many columns make a row and then save that
258: //column information for future use
259: private void loadMetaData() throws Exception {
260: //open the input data file for reading the metadata information
261: openFile();
262: // if column definition is true, ignore the first row since that's not
263: // really the data do uppercase because the ui shows the values as True
264: // and False
265: if (columnDefinition.toUpperCase(java.util.Locale.ENGLISH)
266: .equals(
267: ControlInfo.INTERNAL_TRUE
268: .toUpperCase(java.util.Locale.ENGLISH))) {
269: hasColumnDefinition = true;
270: ignoreFirstRow();
271: }
272:
273: if (formatCode == DEFAULT_FORMAT_CODE) {
274: findNumberOfColumnsInARow();
275: }
276: closeStream();
277: }
278:
279: /**close the input data file
280: * @exception Exception if there is an error
281: */
282: void closeStream() throws Exception {
283: if (streamOpenForReading) {
284: bufferedReader.close();
285: streamOpenForReading = false;
286: }
287: }
288:
289: //actually looks at the data file to find how many columns make up a row
290: int findNumberOfColumnsInARow() throws Exception {
291: // init the number of columns to 1 - no such thing as a table
292: // without columns
293: numberOfColumns = 1;
294: while (!readTokensUntilEndOfRecord()) {
295: numberOfColumns++;
296: }
297: //--numberOfColumns;
298: //what shall we do if there is delimeter after the last column?
299: //reducing the number of columns seems to work fine.
300:
301: //this is necessary to be able to read delimited files that have a delimeter
302: //at the end of a row.
303: if (hasDelimiterAtEnd) {
304: --numberOfColumns;
305: }
306:
307: // a special check - if the imported file is empty then
308: // set the number of columns to 0
309: if (numberOfCharsReadSoFar == 0) {
310: numberOfColumns = 0;
311: }
312: return numberOfColumns;
313: }
314:
315: //keep track of white spaces in the front. We use positionOfNonWhiteSpaceCharInFront for
316: //that. It has the count of number of white spaces found so far before any non-white char
317: //in the token.
318: //Look for whitespace only if field start delimiter is not found yet. Any white spaces
319: //within the start and stop delimiters are ignored.
320: //Also if one of the white space chars is same as recordSeparator or fieldSeparator then
321: //disregard it.
322: private void checkForWhiteSpaceInFront() {
323: //if found white space characters so far, the following if will be true
324: if ((positionOfNonWhiteSpaceCharInFront + 1) == totalCharsSoFar
325: && ((!foundStartDelimiter) && (!foundStartAndStopDelimiters))) {
326: char currentChar = currentToken[positionOfNonWhiteSpaceCharInFront];
327: if (//currentChar == '\t' ||
328: //currentChar == '\r' || alc: why isn't this included?
329: // alc: BTW, \r and \n should be replaced
330: // or amended with the first char of line.separator...
331: //currentChar == '\n' ||
332: //currentChar == ' ') {
333: // use String.trim()'s definition of whitespace.
334: // i18n - check for whitespace - avoid doing a hard coded character
335: // check and use the isWhitespace method to cover all the Unicode
336: // options
337: Character.isWhitespace(currentChar) == true) {
338:
339: if ((recordSepStartNotWhite || (currentChar != recordSeparatorChar0))
340: && (fieldSepStartNotWhite || (currentChar != fieldSeparatorChar0)))
341: //disregard if whitespace char is same as separator first char
342: positionOfNonWhiteSpaceCharInFront++;
343: }
344: }
345: }
346:
347: //look for white spaces from the back towards the stop delimiter position.
348: //If there was no startdelimite & stopdelimiter combination, then we start from the back
349: //all the way to the beginning and stop when we find non-white char
350: //positionOfNonWhiteSpaceCharInBack keeps the count of whitespaces at the back
351: private void checkForWhiteSpaceInBack() {
352: boolean onlyWhiteSpaceSoFar = true;
353: positionOfNonWhiteSpaceCharInBack = 0;
354:
355: for (int i = totalCharsSoFar; (i > stopDelimiterPosition)
356: && onlyWhiteSpaceSoFar; i--) {
357: char currentChar = currentToken[i];
358: // replace test on \t,\n,' ' with String.trim's definition of white space
359: // i18n - check for whitespace - avoid doing a hard coded character
360: // check and use the isWhitespace method to cover all the Unicode
361: // options
362: if (Character.isWhitespace(currentChar) == true) {
363:
364: if ((recordSepStartNotWhite || (currentChar != recordSeparatorChar0))
365: && (fieldSepStartNotWhite || (currentChar != fieldSeparatorChar0)))
366: //disregard if whitespace char is same as separator first char
367: positionOfNonWhiteSpaceCharInBack++;
368: } else
369: onlyWhiteSpaceSoFar = false;
370: }
371: }
372:
373: //keep looking for field and record separators simultaneously because we don't yet
374: //know how many columns make up a row in this data file. Stop as soon as we get
375: //the record separator which is indicated by a return value of true from this function
376: boolean readTokensUntilEndOfRecord() throws Exception {
377: int nextChar;
378: int fieldSeparatorIndex = 0;
379: int recordSeparatorIndex = 0;
380:
381: fieldStopDelimiterIndex = 0;
382: fieldStartDelimiterIndex = 0;
383: totalCharsSoFar = 0;
384: //at the start of every new token, make white space in front count 0
385: positionOfNonWhiteSpaceCharInFront = 0;
386: foundStartDelimiter = false;
387: foundStartAndStopDelimiters = false;
388: numberOfCharsReadSoFar = 0;
389:
390: while (true) {
391: nextChar = bufferedReader.read();
392: if (nextChar == -1)
393: return true;
394: numberOfCharsReadSoFar++;
395: //read the character into the token holder. If token holder reaches it's capacity,
396: //double it's capacity
397: currentToken[totalCharsSoFar++] = (char) nextChar;
398: //check if character read is white space char in front
399: checkForWhiteSpaceInFront();
400: if (totalCharsSoFar == currentTokenMaxSize) {
401: currentTokenMaxSize = currentTokenMaxSize * 2;
402: char[] tempArray = new char[currentTokenMaxSize];
403: System.arraycopy(currentToken, 0, tempArray, 0,
404: totalCharsSoFar);
405: currentToken = tempArray;
406: }
407:
408: //see if we can find fieldSeparator
409: fieldSeparatorIndex = lookForPassedSeparator(
410: fieldSeparator, fieldSeparatorIndex,
411: fieldSeparatorLength, nextChar, false);
412: //every time we find a column separator, the return false will indicate that count
413: //this token as column data value and keep lookin for more tokens or record
414: //separator
415: if (fieldSeparatorIndex == -1)
416: return false;
417:
418: //if found start delimiter, then don't look for record separator, just look for
419: //end delimiter
420: if (!foundStartDelimiter) {
421: //see if we can find recordSeparator
422: recordSeparatorIndex = lookForPassedSeparator(
423: recordSeparator, recordSeparatorIndex,
424: recordSeparatorLength, nextChar, true);
425: if (recordSeparatorIndex == -1)
426: return true;
427: }
428: }
429: }
430:
/**
 * Feeds the character just read into the separator/delimiter matching
 * state and reports whether the passed separator has been completed.
 *
 * While no start delimiter is pending, the character is first matched
 * against the field start delimiter and then against the passed
 * separator. While a start delimiter is pending, only the field stop
 * delimiter is looked for.
 *
 * When {@code lookForRecordSeperator} is true the contents of
 * {@code delimiter} are not consulted: a record ends at a line feed, a
 * carriage return, or a carriage return followed by a line feed.
 *
 * @param delimiter              the field or record separator to look for
 * @param delimiterIndex         number of separator characters matched so far
 * @param delimiterLength        length of {@code delimiter}
 * @param nextChar               the character that was just read
 * @param lookForRecordSeperator true when looking for the end of a record
 * @return -1 when the field or record separator has been found (its
 *         characters are removed from the token length); otherwise the
 *         number of separator characters matched so far, or 0
 * @exception IOException if peeking ahead in the stream fails
 */
private int lookForPassedSeparator(char[] delimiter,
        int delimiterIndex, int delimiterLength, int nextChar,
        boolean lookForRecordSeperator) throws IOException {

    // foundStartDelimiter is false while no start delimiter is pending. In
    // that state both the start delimiter and the passed separator are
    // looked for. Once a start delimiter has been found, only the stop
    // delimiter is looked for and the passed separator is ignored.
    if (!foundStartDelimiter) {
        // Look for the start delimiter only if one is configured and no
        // complete start/stop pair has been seen for this token yet.
        if (fieldStartDelimiterLength != 0
                && (!foundStartAndStopDelimiters)) {
            // Only entered once all leading white space has been passed and
            // while the non-white part is still short enough to be the
            // start delimiter.
            if (totalCharsSoFar != positionOfNonWhiteSpaceCharInFront
                    && (totalCharsSoFar - positionOfNonWhiteSpaceCharInFront) <= fieldStartDelimiterLength) {
                // After the leading white space, look for the start
                // delimiter. If found, set the foundStartDelimiter flag.
                if (nextChar == fieldStartDelimiter[fieldStartDelimiterIndex]) {
                    fieldStartDelimiterIndex++;
                    if (fieldStartDelimiterIndex == fieldStartDelimiterLength) {
                        foundStartDelimiter = true;
                        // the characters read so far are the start
                        // delimiter itself, so discard them
                        totalCharsSoFar = 0;
                        positionOfNonWhiteSpaceCharInFront = 0;
                        return 0;
                    }
                } else {
                    // Mismatch for the start delimiter. If some characters
                    // had matched before, check the remaining characters
                    // again, e.g. delimiter "xa" and data "xxa".
                    // NOTE(review): the result of the re-check is discarded
                    // and fieldStartDelimiterIndex is left unchanged -
                    // confirm this is intended.
                    if (fieldStartDelimiterIndex > 0) {
                        reCheckRestOfTheCharacters(totalCharsSoFar
                                - fieldStartDelimiterIndex,
                                fieldStartDelimiter,
                                fieldStartDelimiterLength);
                    }
                }
            }
        }

        /* Look for the typical record separators: a line feed (\n), a
         * carriage return (\r) or a carriage return followed by a line
         * feed (\r\n).
         */
        if (lookForRecordSeperator) {
            if (nextChar == '\r' || nextChar == '\n') {
                // remember which terminator was actually seen
                recordSeparatorChar0 = (char) nextChar;
                if (nextChar == '\r') {
                    // omit the line feed character if it follows in the stream
                    omitLineFeed();
                }

                // drop the terminator character from the token
                totalCharsSoFar = totalCharsSoFar - 1;
                return -1;
            }

            return delimiterIndex;
        }

        // look for the passed delimiter
        if (nextChar == delimiter[delimiterIndex]) {
            delimiterIndex++;
            if (delimiterIndex == delimiterLength) { // found the passed delimiter
                totalCharsSoFar = totalCharsSoFar - delimiterLength;
                return -1;
            }
            return delimiterIndex; // this many delimiter chars match exactly so far
        } else {
            // Mismatch for the delimiter. If some characters had matched
            // before, check the remaining characters again, e.g.
            // delimiter "xa" and data "xxa". With no earlier match,
            // control falls through to the final return 0.
            if (delimiterIndex > 0)
                return (reCheckRestOfTheCharacters(totalCharsSoFar
                        - delimiterIndex, delimiter,
                        delimiterLength));
        }
    } else {
        // inside a start delimiter: see if we can find fieldStopDelimiter
        if (nextChar == fieldStopDelimiter[fieldStopDelimiterIndex]) {
            fieldStopDelimiterIndex++;
            if (fieldStopDelimiterIndex == fieldStopDelimiterLength) {
                // A stop delimiter immediately followed by another one is
                // an escaped delimiter inside the data, not the end.
                boolean skipped = skipDoubleDelimiters(fieldStopDelimiter);
                if (!skipped) {
                    foundStartDelimiter = false;
                    // found the stop delimiter: discard its characters
                    totalCharsSoFar = totalCharsSoFar
                            - fieldStopDelimiterLength;
                    // Remembered so that a case like "aa"aa can be
                    // reported as an error and a case like "aa" is
                    // truncated to just aa.
                    stopDelimiterPosition = totalCharsSoFar;
                    // distinguishes the empty string ,"", from the null
                    // string ,,
                    foundStartAndStopDelimiters = true;
                } else {
                    fieldStopDelimiterIndex = 0;
                }
                return 0;
            }
            return 0;
        } else {
            // Mismatch for the stop delimiter. If some characters had
            // matched before, check the remaining characters again, e.g.
            // stop delimiter "xa" and data "xxa".
            // NOTE(review): the result of the re-check is discarded -
            // confirm this is intended.
            if (fieldStopDelimiterIndex > 0) {
                reCheckRestOfTheCharacters(totalCharsSoFar
                        - fieldStopDelimiterIndex,
                        fieldStopDelimiter,
                        fieldStopDelimiterLength);
                return 0;
            }
        }
    }
    return 0;
}
551:
552: //If after finding a few matching characters for a delimiter, find a mismatch,
553: //restart the matching process from character next to the one from which you
554: //were in the process of finding the matching pattern
555: private int reCheckRestOfTheCharacters(int startFrom,
556: char[] delimiter, int delimiterLength) {
557: int delimiterIndex = 0;
558: // alc: need to test delim of abab with abaabab
559: // if delimIndex resets to 0, i probably needs to reset to
560: // (an ever increasing) startFrom=startFrom+1, not stay where it is
561: for (int i = startFrom; i < totalCharsSoFar; i++) {
562: if (currentToken[i] == delimiter[delimiterIndex])
563: delimiterIndex++;
564: else
565: delimiterIndex = 0;
566: }
567: return delimiterIndex;
568: }
569:
570: /*
571: * skips the duplicate delimeter characters inserd character stringd ata
572: * to get the original string. In Double Delimter recognigation Delimiter
573: * Format strings are written with a duplicate delimeter if a delimiter is
574: * found inside the data while exporting.
575: * For example with double quote(") as character delimiter
576: *
577: * "What a ""nice""day!"
578: *
579: * will be imported as:
580: *
581: * What a "nice"day!
582: *
583: * In the case of export, the rule applies in reverse. For example,
584: *
585: * I am 6"tall.
586: *
587: * will be exported to a file as:
588: *
589: * "I am 6""tall."
590: */
591: private boolean skipDoubleDelimiters(char[] characterDelimiter)
592: throws IOException {
593: boolean skipped = true;
594: int cDelLength = characterDelimiter.length;
595: bufferedReader.mark(cDelLength);
596: for (int i = 0; i < cDelLength; i++) {
597: int nextChar = bufferedReader.read();
598: if (nextChar != characterDelimiter[i]) {
599: //not a double delimter case
600: bufferedReader.reset();
601: skipped = false;
602: break;
603: }
604: }
605: return skipped;
606: }
607:
608: //omit the line feed character(\n)
609: private void omitLineFeed() throws IOException {
610: bufferedReader.mark(1);
611: int nextChar = bufferedReader.read();
612: if (nextChar != '\n') {
613: //not a Line Feed
614: bufferedReader.reset();
615: }
616: }
617:
618: /**returns the number of the current row
619: */
620: int getCurrentRowNumber() {
621: return lineNumber;
622: }
623:
624: /**the way we read the next row from input file depends on it's format
625: * @exception Exception if there is an error
626: */
627: boolean readNextRow(String[] returnStringArray) throws Exception {
628: boolean readVal;
629: int idx;
630:
631: if (!streamOpenForReading) {
632: openFile();
633: //as earlier, ignore the first row if it's colum definition
634: //do uppercase because the ui shows the values as True and False
635: if (hasColumnDefinition) {
636: ignoreFirstRow();
637: }
638: }
639: if (formatCode == DEFAULT_FORMAT_CODE)
640: readVal = readNextDelimitedRow(returnStringArray);
641: else
642: readVal = readNextFixedRow(returnStringArray);
643:
644: return readVal;
645: }
646:
// Whether the fixed-format row just read is followed by a record
// separator. Kept as a field so it is not initialised for each row; it is
// only cleared, and then set again, by readNextFixedRow() for a row that
// lacks the separator (typically the last row in a file, so it is not
// used much).

private boolean haveSep = true;
652:
/**
 * Reads one fixed-format row: {@code rowWidth} characters are read into
 * {@code tempString} and then cut into columns according to
 * {@code columnWidths}.
 *
 * @param returnStringArray receives the column values; a column of width
 *        0, or one whose trimmed text equals {@code nullString}, is
 *        returned as {@code null}
 * @return {@code true} if a row was read, {@code false} if the end of the
 *         file was reached before any character of a row
 * @exception Exception if the file ends in the middle of a row or the
 *            record separator is missing
 */
private boolean readNextFixedRow(String[] returnStringArray)
        throws Exception {
    // readLength is how many characters have been read so far
    int readLength = 0;
    // totalLength is the value of readLength after the previous read
    int totalLength = 0;

    // Keep reading until rowWidth characters have been read. At end of
    // stream read() returns -1, which is added to readLength like any
    // other count.
    while ((readLength += bufferedReader.read(tempString,
            readLength, rowWidth - readLength)) < rowWidth) {

        // readLength having dropped one below the previous total means
        // the last read returned -1, i.e. EOF
        if (readLength == totalLength - 1) {// EOF
            if (readLength == -1) { // no row, EOF
                return false;
            } else {
                // it's only a bad read if insufficient data was
                // returned; missing the last record separator is ok
                if (totalLength != rowWidth
                        - recordSeparator.length) {
                    throw LoadError
                            .unexpectedEndOfFile(lineNumber + 1);
                } else {
                    haveSep = false;
                    break;
                }
            }
        }
        // else, something was read; continue until the whole row is read
        totalLength = readLength;
    }

    // cut the row into its columns
    int colStart = 0;
    for (int i = 0; i < numberOfColumns; i++) {
        int colWidth = columnWidths[i];

        if (colWidth == 0) // if the column width is 0, return null
            returnStringArray[i] = null;
        else {
            // if the null string is found, return it as a null value
            String checkAgainstNullString = new String(tempString,
                    colStart, colWidth);
            if (checkAgainstNullString.trim().equals(nullString))
                returnStringArray[i] = null;
            else
                returnStringArray[i] = checkAgainstNullString;
            colStart += colWidth;
        }
    }

    // if what follows the columns is not the record separator, throw an
    // exception
    if (haveSep) {
        for (int i = (recordSeparatorLength - 1); i >= 0; i--) {
            if (tempString[colStart + i] != recordSeparator[i])
                throw LoadError
                        .recordSeparatorMissing(lineNumber + 1);
        }
    } else
        haveSep = true; // reset for the next time, if any.

    lineNumber++;
    return true;
}
716:
/**
 * Reads one delimited row. By this time the number of columns that make
 * up a row is known, so this first reads (number of columns - 1) tokens
 * ending in a field separator and then one token ending in the record
 * separator.
 *
 * @param returnStringArray receives the column values; a null value is
 *        returned as {@code null}
 * @return {@code true} if a row was read; {@code false} if there are no
 *         columns or the end of the file was reached while reading the
 *         first token of the row
 * @exception Exception if the file ends in the middle of a row or data
 *            follows a stop delimiter
 */
private boolean readNextDelimitedRow(String[] returnStringArray)
        throws Exception {

    int upperLimit = numberOfColumns - 1; // reduce # field accesses

    // no data in the input file for some reason
    if (upperLimit < 0)
        return false;

    // look for number of columns - 1 field separators
    for (int i = 0; i < upperLimit; i++) {
        if (!readNextToken(fieldSeparator, 0, fieldSeparatorLength,
                false)) {
            if (i == 0) // still on the first check
                return false;
            else
                throw LoadError.unexpectedEndOfFile(lineNumber + 1);
        }
        // Handles a case like "aa"aa, which results in an error, and a
        // case like "aa" followed by white space, which is truncated to
        // just aa. White space is decided by Character.isWhitespace so
        // that all Unicode white space is covered.
        // NOTE(review): the scan starts at stopDelimiterPosition + 1, so
        // the character stored at index stopDelimiterPosition itself is
        // never examined - confirm this is intended.
        if (stopDelimiterPosition != 0
                && ((stopDelimiterPosition) != totalCharsSoFar)) {
            for (int k = stopDelimiterPosition + 1; k < totalCharsSoFar; k++) {
                if (Character.isWhitespace(currentToken[k]) == false) {
                    throw LoadError.dataAfterStopDelimiter(
                            lineNumber + 1, i + 1);
                }
            }
            totalCharsSoFar = stopDelimiterPosition;
        }
        // totalCharsSoFar can become -1 in readNextToken, meaning null
        if (totalCharsSoFar != -1) {
            returnStringArray[i] = new String(currentToken,
                    positionOfNonWhiteSpaceCharInFront,
                    totalCharsSoFar);
        } else
            returnStringArray[i] = null;
    }

    // Look for the record separator for the last column's value. Reaching
    // end of file here is valid only for a single column table; otherwise
    // it is an error. Without checking the return value of readNextToken,
    // importing into a single column table went into an infinite loop
    // because end of file was ignored.
    if (!readNextToken(recordSeparator, 0, recordSeparatorLength,
            true)) {
        if (upperLimit == 0)
            return false;
        else
            throw LoadError.unexpectedEndOfFile(lineNumber + 1);
    }
    // Same check for data after the stop delimiter as for the other
    // columns above.
    // NOTE(review): this scan also starts at stopDelimiterPosition + 1 -
    // confirm this is intended.
    if (stopDelimiterPosition != 0
            && (stopDelimiterPosition != totalCharsSoFar)) {
        for (int i = stopDelimiterPosition + 1; i < totalCharsSoFar; i++) {
            if (Character.isWhitespace(currentToken[i]) == false) {
                throw LoadError.dataAfterStopDelimiter(
                        lineNumber + 1, numberOfColumns);
            }
        }
        totalCharsSoFar = stopDelimiterPosition;
    }

    // To be able to read delimited files that have a delimiter at the end
    // of each row, totalCharsSoFar is reduced by one for the last column
    // when no field stop delimiter is configured; otherwise the last
    // delimiter becomes part of the data.
    if (hasDelimiterAtEnd) {
        if (!(fieldStopDelimiterLength > 0)) {
            --totalCharsSoFar;
        }
    }

    if (totalCharsSoFar != -1) {

        /* This is a hack to fix a problem: when there is missing data in
           columns and hasDelimiterAtEnd==true, the last delimiter was read
           as the last column's data. This skips the last column when it
           is just the delimiter.
           Care is needed for the case where the last column's data itself
           is actually the same as the delimiter.
        */
        if (!hasDelimiterAtEnd) {// normal path:
            returnStringArray[upperLimit] = new String(
                    currentToken,
                    positionOfNonWhiteSpaceCharInFront,
                    totalCharsSoFar);
        } else if (totalCharsSoFar == fieldSeparatorLength
                && isFieldSep(currentToken)) {
            // hasDelimiterAtEnd==true and the token is exactly as long as,
            // and starts with, the field separator

            String currentStr = new String(currentToken,
                    positionOfNonWhiteSpaceCharInFront,
                    totalCharsSoFar);

            // NOTE(review): fieldStopDelimiter[0] is read without checking
            // fieldStopDelimiterLength - confirm the delimiter cannot be
            // empty on this path.
            if (currentToken[totalCharsSoFar + 1] == fieldStopDelimiter[0]) {
                returnStringArray[upperLimit] = currentStr;
            } else {
                returnStringArray[upperLimit] = null;
            }
        } else {
            // hasDelimiterAtEnd==true and the previous case does not apply
            if (totalCharsSoFar > 0) {
                returnStringArray[upperLimit] = new String(
                        currentToken,
                        positionOfNonWhiteSpaceCharInFront,
                        totalCharsSoFar);
            } else {
                returnStringArray[upperLimit] = null;
            }
        }
    } else
        returnStringArray[upperLimit] = null;

    lineNumber++;
    return true;
}
868:
869: //tells if a char array is field separator:
870: private boolean isFieldSep(char[] chrArray) {
871: for (int i = 0; i < chrArray.length && i < fieldSeparatorLength; i++) {
872: if (chrArray[i] != fieldSeparator[i])
873: return false;
874: }
875: return true;
876: }
877:
878: //read one column's value at a time
879: boolean readNextToken(char[] delimiter, int delimiterIndex,
880: int delimiterLength, boolean isRecordSeperator)
881: throws Exception {
882: int nextChar;
883:
884: fieldStopDelimiterIndex = 0;
885: fieldStartDelimiterIndex = 0;
886: totalCharsSoFar = 0;
887: //at the start of every new token, make white space in front count 0
888: positionOfNonWhiteSpaceCharInFront = 0;
889: stopDelimiterPosition = 0;
890: foundStartAndStopDelimiters = false;
891: foundStartDelimiter = false;
892: int returnValue;
893:
894: while (true) {
895: nextChar = bufferedReader.read();
896: if (nextChar == -1) //end of file
897: return false;
898:
899: //read the character into the token holder. If token holder reaches it's capacity,
900: //double it's capacity
901: currentToken[totalCharsSoFar++] = (char) nextChar;
902: //check if character read is white space char in front
903: checkForWhiteSpaceInFront();
904: if (totalCharsSoFar == currentTokenMaxSize) {
905: currentTokenMaxSize = currentTokenMaxSize * 2;
906: char[] tempArray = new char[currentTokenMaxSize];
907: System.arraycopy(currentToken, 0, tempArray, 0,
908: totalCharsSoFar);
909: currentToken = tempArray;
910: }
911:
912: returnValue = lookForPassedSeparator(delimiter,
913: delimiterIndex, delimiterLength, nextChar,
914: isRecordSeperator);
915: if (returnValue == -1) {
916: //if no stop delimiter found that "" this means null
917: //also if no stop delimiter found then get rid of spaces around the token
918: if (!foundStartAndStopDelimiters) {
919: if (totalCharsSoFar == 0)
920: totalCharsSoFar = -1;
921: else {
922: //get the count of white spaces from back and subtract that and white spaces in
923: //the front from the characters read so far so that we ignore spaces around the
924: //token.
925: checkForWhiteSpaceInBack();
926: totalCharsSoFar = totalCharsSoFar
927: - positionOfNonWhiteSpaceCharInFront
928: - positionOfNonWhiteSpaceCharInBack;
929: }
930: }
931: return true;
932: }
933: delimiterIndex = returnValue;
934: }
935: }
936: }
|