Return the Unicode char which is coded in the bytes at position 0. : Code Unicode « Development Class


    

import java.io.File;

import java.io.FileFilter;

import java.util.ArrayList;

import java.util.List;

import java.util.regex.Pattern;

import java.util.regex.PatternSyntaxException;



/*

 *  Licensed to the Apache Software Foundation (ASF) under one

 *  or more contributor license agreements.  See the NOTICE file

 *  distributed with this work for additional information

 *  regarding copyright ownership.  The ASF licenses this file

 *  to you under the Apache License, Version 2.0 (the

 *  "License"); you may not use this file except in compliance

 *  with the License.  You may obtain a copy of the License at

 *  

 *    http://www.apache.org/licenses/LICENSE-2.0

 *  

 *  Unless required by applicable law or agreed to in writing,

 *  software distributed under the License is distributed on an

 *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

 *  KIND, either express or implied.  See the License for the

 *  specific language governing permissions and limitations

 *  under the License. 

 *  

 */







/**

 * Various string manipulation methods that are more efficient than chaining

 * string operations: all is done in the same buffer without creating a bunch of

 * string objects.

 * 

 * @author <a href="mailto:dev@labs.apache.org">Dungeon Project</a>

 */

public class Main {
  /** Bit set in the lead byte of every multi-byte UTF-8 sequence (1xxx-xxxx). */
  private static final int UTF8_MULTI_BYTES_MASK = 0x0080;

  /** Mask / expected value pair identifying a two-byte lead (110x-xxxx). */
  private static final int UTF8_TWO_BYTES_MASK = 0x00E0;

  private static final int UTF8_TWO_BYTES = 0x00C0;

  /** Mask / expected value pair identifying a three-byte lead (1110-xxxx). */
  private static final int UTF8_THREE_BYTES_MASK = 0x00F0;

  private static final int UTF8_THREE_BYTES = 0x00E0;

  /** Mask / expected value pair identifying a four-byte lead (1111-0xxx). */
  private static final int UTF8_FOUR_BYTES_MASK = 0x00F8;

  private static final int UTF8_FOUR_BYTES = 0x00F0;

  /** Mask / expected value pair identifying a five-byte lead (1111-10xx). */
  private static final int UTF8_FIVE_BYTES_MASK = 0x00FC;

  private static final int UTF8_FIVE_BYTES = 0x00F8;

  /** Mask / expected value pair identifying a six-byte lead (1111-110x). */
  private static final int UTF8_SIX_BYTES_MASK = 0x00FE;

  private static final int UTF8_SIX_BYTES = 0x00FC;

  /** Payload bits carried by a continuation byte (10zz-zzzz). */
  private static final int UTF8_CONTINUATION_PAYLOAD = 0x003F;

  /** Sentinel returned when no char can be decoded: -1 narrowed to a char, i.e. 0xFFFF. */
  private static final char NO_CHAR = ( char ) -1;

  /**
   * Return the Unicode char which is coded in the bytes at position 0.
   * 
   * @param bytes
   *            The byte[] representation of a Unicode string.
   * @return The first char found, or <code>(char) -1</code> if no char can
   *         be decoded (see {@link #bytesToChar(byte[], int)}).
   */
  public static final char bytesToChar( byte[] bytes )
  {
      return bytesToChar( bytes, 0 );
  }

  /**
   * Return the Unicode char which is coded in the bytes at the given
   * position.
   * <p>
   * The lead byte at <code>pos</code> selects a sequence of one to six
   * bytes. The payload bits of the lead byte and of the following bytes are
   * concatenated, and the result is narrowed to a char. A char holds only
   * 16 bits, so for sequences of four bytes or more only the low 16 bits of
   * the decoded value are returned.
   * <p>
   * The two high bits of the continuation bytes are not checked: only their
   * six low bits are used.
   * 
   * @param bytes
   *            The byte[] representation of a Unicode string.
   * @param pos
   *            The current position to start decoding the char
   * @return The decoded char, or <code>(char) -1</code> (0xFFFF) if no char
   *         can be decoded: <code>bytes</code> is null, <code>pos</code> is
   *         outside the array, the byte at <code>pos</code> is not a valid
   *         lead byte, or the sequence runs past the end of the array.
   */
  public static final char bytesToChar( byte[] bytes, int pos )
  {
      if ( ( bytes == null ) || ( pos < 0 ) || ( pos >= bytes.length ) )
      {
          return NO_CHAR;
      }

      // The byte is sign-extended here; every use below masks it first.
      int lead = bytes[pos];

      if ( ( lead & UTF8_MULTI_BYTES_MASK ) == 0 )
      {
          // Single byte (ASCII) char
          return ( char ) lead;
      }

      int length = sequenceLength( lead );

      // Written as a subtraction so that it cannot overflow for large pos.
      if ( ( length == 0 ) || ( bytes.length - pos < length ) )
      {
          return NO_CHAR;
      }

      // A lead byte of an n-byte sequence carries ( 7 - n ) payload bits:
      // 110x-xxxx -> 0x1F, 1110-xxxx -> 0x0F, ... 1111-110x -> 0x01
      int value = lead & ( 0x7F >> length );

      for ( int i = 1; i < length; i++ )
      {
          // Each continuation byte 10zz-zzzz adds six more low bits.
          value = ( value << 6 ) | ( bytes[pos + i] & UTF8_CONTINUATION_PAYLOAD );
      }

      // Narrowing keeps the low 16 bits only.
      return ( char ) value;
  }

  /**
   * Tell how many bytes the sequence introduced by the given lead byte
   * occupies.
   * 
   * @param lead
   *            The first byte of a multi-byte sequence
   * @return A length between 2 and 6, or 0 if the byte is not the lead byte
   *         of a multi-byte sequence
   */
  private static int sequenceLength( int lead )
  {
      if ( ( lead & UTF8_TWO_BYTES_MASK ) == UTF8_TWO_BYTES )
      {
          return 2;
      }
      else if ( ( lead & UTF8_THREE_BYTES_MASK ) == UTF8_THREE_BYTES )
      {
          return 3;
      }
      else if ( ( lead & UTF8_FOUR_BYTES_MASK ) == UTF8_FOUR_BYTES )
      {
          return 4;
      }
      else if ( ( lead & UTF8_FIVE_BYTES_MASK ) == UTF8_FIVE_BYTES )
      {
          return 5;
      }
      else if ( ( lead & UTF8_SIX_BYTES_MASK ) == UTF8_SIX_BYTES )
      {
          return 6;
      }
      else
      {
          return 0;
      }
  }
}
Return the Unicode char which is coded in the bytes at position 0. : Code Unicode « Development Class « Java