// ============================================================================
// Unicode related functions
// Copyright (c) 2002, Juergen Haible. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
// ============================================================================

unit uUnicoding;

interface

uses SysUtils;

type
   UTF16 = Word;  UTF16Char = WideChar;  UTF16Str = WideString;
   UTF8  = Byte;  UTF8Char  = Char;      UTF8Str  = AnsiString;
   UTF7  = Byte;  UTF7Char  = Char;      UTF7Str  = AnsiString;

   TUTF_ByteOrderMarks = ( bomNative, bomBE, bomLE );

   TUTF16_ByteOrderMark = record case Boolean of
      True : ( c16   : UTF16 );
      False: ( b1, b2: Byte  )
   end;

const
   // Byte Order Mark (U+FEFF)
   UTF16_BOM: array[ TUTF_ByteOrderMarks ] of TUTF16_ByteOrderMark = (
      ( c16:$FEFF ),      // bomNative (=[bomLE] for Delphi on Intel)
      ( b1:$FE; b2:$FF ), // bomBE (Big Endian)
      ( b1:$FF; b2:$FE )  // bomLE (Little Endian)
   );

function UTF16ToUTF8( const In16       : UTF16Str;
                      out   Out8       : UTF8Str;
                      const BOM_Default: TUTF_ByteOrderMarks = bomNative ): Boolean;

function UTF8ToUTF16( const In8        : UTF8Str;
                      out   Out16      : UTF16Str;
                      const BOM_Default: TUTF_ByteOrderMarks = bomNative ): Boolean;

function UTF16ToUTF7( const In16       : UTF16Str;
                      out   Out7       : UTF7Str;
                      const SafeHeaders: Boolean = True;
                      const BOM_Default: TUTF_ByteOrderMarks = bomNative ): Boolean;

function UTF7ToUTF16( const In7        : UTF7Str;
                      out   Out16      : UTF16Str;
                      const BOM_Default: TUTF_ByteOrderMarks = bomNative ): Boolean;

function UTF8ToUTF7 ( const In8 : UTF8Str;
                      out   Out7: UTF7Str ): Boolean;

function UTF7ToUTF8 ( const In7 : UTF7Str;
                      out   Out8: UTF8Str ): Boolean;


implementation

type
   UTF32 = LongWord;

const
   UNI_SUR_HIGH_START = UTF32( $D800 );
   UNI_SUR_HIGH_END   = UTF32( $DBFF );
   UNI_SUR_LOW_START  = UTF32( $DC00 );
   UNI_SUR_LOW_END    = UTF32( $DFFF );

const
   ULIMIT16_1BYTE =     $007F; // 0000 0000-0000 007F   0xxxxxxx
   ULIMIT16_2BYTE =     $07FF; // 0000 0080-0000 07FF   110xxxxx 10xxxxxx
   ULIMIT16_3BYTE =     $FFFF; // 0000 0800-0000 FFFF   1110xxxx 10xxxxxx * 2
   ULIMIT16_4BYTE =   $1FFFFF; // 0001 0000-001F FFFF   11110xxx 10xxxxxx * 3

   UMAX_UTF16     =   $10FFFF;
   UMAX_UTF32     = $7FFFFFFF;

function Surrogate16To32( const ch, cl: UTF16; var c32: UTF32 ): Boolean;
// convert UTF16-surrogate pair to UTF32
begin
   Result := False;

   if ( ch >= UNI_SUR_HIGH_START ) and ( ch <= UNI_SUR_HIGH_END ) then begin
      if ( cl >= UNI_SUR_LOW_START ) and ( cl <= UNI_SUR_LOW_END ) then begin

         c32 := ( UTF32(ch) - UNI_SUR_HIGH_START ) shl 10
              + ( UTF32(cl) - UNI_SUR_LOW_START  ) + $10000;

         if c32 <= UMAX_UTF16 then Result := True;

      end;
   end;
end;

function UTF_MustSwap16( const In16 : UTF16Str;
                         const InLen: Integer;
                         const BOM_Default: TUTF_ByteOrderMarks ): Boolean;
var  c16: UTF16;
begin
   // default for no BOM given
   case BOM_Default of
      bomBE: Result := ( UTF16_BOM[bomBE].c16 <> UTF16_BOM[bomNative].c16 );
      bomLE: Result := ( UTF16_BOM[bomLE].c16 <> UTF16_BOM[bomNative].c16 );
      else   Result := False;
   end;

   // check for BOM in 1st char
   if InLen > 0 then begin
      c16 := UTF16( In16[1] );
      if c16 = UTF16_BOM[bomBE].c16 then
         Result := ( c16 <> UTF16_BOM[bomNative].c16 )
      else if c16 = UTF16_BOM[bomLE].c16 then
         Result := ( c16 <> UTF16_BOM[bomNative].c16 );
   end;
end;

function UTF16ToUTF8( const In16: UTF16Str; out Out8: UTF8Str;
                      const BOM_Default: TUTF_ByteOrderMarks = bomNative ): Boolean;
var  InPtr, InLen: Integer;
     c16: UTF16;
     c32: UTF32;
     MustSwap: Boolean;
begin
   Result := True;

   SetLength( Out8, 0 );
   InLen := length( In16 );
   InPtr := 1;

   MustSwap := UTF_MustSwap16( In16, InLen, BOM_Default );

   while InPtr <= InLen do begin

      c16 := UTF16( In16[InPtr] );
      if MustSwap then c16 := Swap( c16 );

      if c16 <= ULIMIT16_1BYTE then begin
         Out8 := Out8 + chr( c16 );

      end else if c16 <= ULIMIT16_2BYTE then begin
         Out8 := Out8 + chr( $C0 or ( (c16 shr 6) and $3F ) )
                      + chr( $80 or ( (c16      ) and $3F ) );

      end else if ( c16 >= UNI_SUR_HIGH_START ) and
                  ( c16 <= UNI_SUR_HIGH_END   ) then begin // surrogate pair
         inc( InPtr );
         if InPtr <= InLen then begin
            if Surrogate16To32( c16, UTF16( In16[InPtr] ), c32 ) then begin
               Out8 := Out8 + chr( $F0 or ( (c32 shr 18) and $3F ) )
                            + chr( $80 or ( (c32 shr 12) and $3F ) )
                            + chr( $80 or ( (c32 shr  6) and $3F ) )
                            + chr( $80 or ( (c32       ) and $3F ) )
            end else Result := False;
         end else Result := False;

      end else begin // remaining 0800-FFFF
         Out8 := Out8 + chr( $E0 or ( (c16 shr 12) and $3F ) )
                      + chr( $80 or ( (c16 shr  6) and $3F ) )
                      + chr( $80 or ( (c16       ) and $3F ) );
         if ( c16 >= UNI_SUR_LOW_START ) and
            ( c16 <= UNI_SUR_LOW_END ) then Result := False; // orphaned surr.

      end;

      inc( InPtr );
   end;
end;

function UTF8ToUTF16( const In8: UTF8Str; out Out16: UTF16Str;
                      const BOM_Default: TUTF_ByteOrderMarks = bomNative ): Boolean;
var  InPtr, InLen: Integer;
     c8, ca, cb, cc: UTF8;
     c16: UTF16;
     c32: UTF32; 
begin
   Result := True;

   SetLength( Out16, 0 );
   InLen := length( In8 );
   InPtr := 1;

   while InPtr <= InLen do begin

      c8 := UTF8( UTF8Char( In8[InPtr] ) );

      if c8 >= $F8 then begin // 11111xxx
         Result := False; // invalid

      end else if c8 >= $F0 then begin // 11110xxx 10xxxxxx * 3
         if ( InPtr + 3 ) > InLen then begin
            Result := False; // incomplete
            inc( InPtr, 3 ); // skip rest
         end else begin
            ca := UTF8( In8[InPtr+1] );
            cb := UTF8( In8[InPtr+2] );
            cc := UTF8( In8[InPtr+3] );
            inc( InPtr, 3 );
            if ( ca and $C0 ) <> $80 then Result := False;
            if ( cb and $C0 ) <> $80 then Result := False;
            if ( cc and $C0 ) <> $80 then Result := False;

            c32 := ( ( c8 and $07 ) shl 18 )
                or ( ( ca and $3F ) shl 12 )
                or ( ( cb and $3F ) shl 6  )
                or ( ( cc and $3F )        );

            if c32 > UMAX_UTF16 then begin
               Result := False;
            end else begin
               c16 := ( c32 - $10000 ) shr 10   + UNI_SUR_HIGH_START;
               Out16 := Out16 + UTF16Char( c16 );
               c16 := ( c32 - $10000 ) and $3FF + UNI_SUR_LOW_START;
               Out16 := Out16 + UTF16Char( c16 );
            end;
            if c32 <= ULIMIT16_3BYTE then Result := False; // invalid coding
         end;

      end else if c8 >= $E0 then begin // 1110xxxx 10xxxxxx * 2
         if ( InPtr + 2 ) > InLen then begin
            Result := False; // incomplete
            inc( InPtr, 2 ); // skip rest
         end else begin
            ca := UTF8( In8[InPtr+1] );
            cb := UTF8( In8[InPtr+2] );
            inc( InPtr, 2 );
            if ( ca and $C0 ) <> $80 then Result := False;
            if ( cb and $C0 ) <> $80 then Result := False;

            c16 := ( ( c8 and $0F ) shl 12 )
                or ( ( ca and $3F ) shl 6  )
                or ( ( cb and $3F )        );
            if c16 <= ULIMIT16_2BYTE then Result := False; // invalid coding
            Out16 := Out16 + UTF16Char( c16 );
         end;

      end else if c8 >= $C0 then begin // 110xxxxx 10xxxxxx
         if ( InPtr + 1 ) > InLen then begin
            Result := False; // incomplete
            inc( InPtr );    // skip rest
         end else begin
            ca := UTF8( In8[InPtr+1] );
            inc( InPtr );
            if ( ca and $C0 ) <> $80 then Result := False;

            c16 := ( ( c8 and $0F ) shl 6 )
                or ( ( ca and $3F )       );
            if c16 <= ULIMIT16_1BYTE then Result := False; // invalid coding
            Out16 := Out16 + UTF16Char( c16 );
         end;

      end else if c8 >= $80 then begin // 10xxxxxx
         Result := False; // invalid

      end else begin // 0xxxxxxx
         Out16 := Out16 + UTF16Char( c8 );

      end;

      inc( InPtr );
   end;

   InLen := length( Out16 );
   if UTF_MustSwap16( Out16, InLen, BOM_Default ) then begin
      for InPtr:=1 to InLen do
         Out16[InPtr] := UTF16Char( Swap( UTF16( Out16[InPtr] ) ) );
   end;
end;

// ----------------------------------------------------------------------------

const
   Utf7SetD = // (directly encoded characters)
      [ 'A'..'Z', 'a'..'z', '0'..'9',
        '''', '(', ')', ',', '-', '.', '/', ':', '?' ];

   Utf7SetO = // (optional direct characters)
      [ '!', '"', '#', '$', '%', '&', '*', ';', '<', '=',
        '>', '@', '[', ']', '^', '_', '`', '{', '|', '}' ];

   Utf7SetX = // (...may be directly represented...)
      [ ' ', #9, #13, #10 ];

   Utf7SetB: PChar = // (Modified Base 64)
      'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';

   MBase64MarkInvalid = $FF;

var
   MBase64TableBuilt: Boolean = False;
   MBase64Table: array[#0..#255] of Byte;

function EncodeModifiedB64( const BufTxt; BufLen: Integer ): String;
var  BufPtr, BufEnd: PChar;
     i, Count, Temp: Integer;
begin
   Result := '';

   BufPtr := @BufTxt;
   BufEnd := BufPtr + BufLen;

   while BufPtr<BufEnd do begin
      Temp := 0;
      Count := 0;
      for i:=0 to 2 do begin
         Temp := (Temp shl 8);
         if BufPtr<BufEnd then begin
            Temp := Temp or ord( BufPtr^ );
            inc( BufPtr );
            inc( Count );
         end;
      end;

      Result := Result + Utf7SetB[ (Temp shr 18) and $3F ]
                       + Utf7SetB[ (Temp shr 12) and $3F ];
      if Count>=2 then Result := Result + Utf7SetB[ (Temp shr 6) and $3F ]
                  else {omit padding};
      if Count>=3 then Result := Result + Utf7SetB[ Temp and $3F ]
                  else {omit padding};
   end;
end;

function DecodeModifiedB64( const BufB64; BufLen: Integer; var OK: Boolean ): String;
var  BufPtr, BufEnd: PChar;
     Pattern, Bits : Integer;
     b64: Byte;
begin
   Result := '';
   OK := True;

   try
      if not MBase64TableBuilt then begin
         FillChar( MBase64Table, sizeof(MBase64Table), MBase64MarkInvalid );
         for b64:=0 to 63 do MBase64Table[ Utf7SetB[b64] ] := b64;
         MBase64TableBuilt := True;
      end;

      BufPtr := @BufB64;
      BufEnd := BufPtr + BufLen;

      Pattern := 0;
      Bits    := 0;

      while BufPtr<BufEnd do begin
         b64 := MBase64Table[ BufPtr^ ];
         if b64<>MBase64MarkInvalid then begin
            Pattern := (Pattern shl 6) or b64;
            inc( Bits, 6 );
            if Bits>=8 then begin
               dec( Bits, 8 );
               Result := Result + chr( ( Pattern shr Bits ) and $FF );
            end;
         end else begin
            OK := False;
            break;
         end;
         inc( BufPtr );
      end;

   except
      Result := '';
      OK := False;
   end;
end;

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

function UTF16ToUTF7( const In16: UTF16Str; out Out7: UTF7Str;
                      const SafeHeaders: Boolean = True;
                      const BOM_Default: TUTF_ByteOrderMarks = bomNative ): Boolean;
var  InPtr, InLen: Integer;
     c16  : UTF16;
     DoBuf: Boolean;
     Buf  : String;
     MustSwap: Boolean;

   procedure AddBuf;
   begin
      Out7 := Out7 + '+' + EncodeModifiedB64( Buf[1], length(Buf) ) + '-';
      DoBuf := False;
      Buf := '';
   end;

begin
   Result := True;

   SetLength( Out7, 0 );
   InLen := length( In16 );
   DoBuf := False;
   Buf := '';

   MustSwap := UTF_MustSwap16( In16, InLen, BOM_Default );

   for InPtr := 1 to InLen do begin

      c16 := UTF16( In16[InPtr] );
      if MustSwap then c16 := Swap( c16 );

      if ( c16 <= $7F ) and ( chr(c16) in Utf7SetD ) then begin
         if DoBuf then AddBuf;
         Out7 := Out7 + chr(c16);

      end else if ( not SafeHeaders ) and
                  ( c16 <= $7F ) and ( chr(c16) in Utf7SetO ) then begin
         if DoBuf then AddBuf;
         Out7 := Out7 + chr(c16);

      end else if ( c16 <= $7F ) and ( chr(c16) in Utf7SetX ) then begin
         if DoBuf then AddBuf;
         Out7 := Out7 + chr(c16);

      end else begin
         DoBuf := True;
         Buf := Buf + chr( (c16 and $FF00) shr 8 ) + chr( c16 and $FF );
      end;

   end;

   if DoBuf then AddBuf;
end;

function UTF7ToUTF16( const In7: UTF7Str; out Out16: UTF16Str;
                      const BOM_Default: TUTF_ByteOrderMarks = bomNative ): Boolean;
var  InPtr, InLen, i: Integer;
     c7 : UTF7;
     c16: UTF16;
     Buf: String;
     ok : Boolean;
begin
   Result := True;

   SetLength( Out16, 0 );
   InLen := length( In7 );
   InPtr := 1;

   while InPtr <= InLen do begin

      c7 := UTF7( In7[InPtr] );
      inc( InPtr );

      if c7 = ord('+') then begin

         Buf := '';
         while InPtr <= InLen do begin
            c7 := UTF7( In7[InPtr] );
            if c7 = ord('-') then begin inc( InPtr ); break end;
            if StrScan( Utf7SetB, chr(c7) ) = nil then break;
            Buf := Buf + chr(c7);
            inc( InPtr );
         end;

         if length( Buf ) = 0 then begin
            if UTF7Char( In7[InPtr-1] ) <> '-' then Result := False;
            c16 := ord('+');
            Out16 := Out16 + UTF16Char( c16 );
         end else begin
            Buf := DecodeModifiedB64( Buf[1], length(Buf), ok );
            if not ok then Result := False;
            if ( length(Buf) mod 2 ) <> 0 then Result := False;
            for i := 1 to length(Buf) div 2 do begin
               c16 := ord( Buf[i*2-1] ) shl 8 or ord( Buf[i*2] );
               Out16 := Out16 + UTF16Char( c16 );
            end;
         end;

      end else begin

         if c7 >= $80 then Result := False;
         c16 := c7;
         Out16 := Out16 + UTF16Char( c16 );

      end;

   end;

   InLen := length( Out16 );
   if UTF_MustSwap16( Out16, InLen, BOM_Default ) then begin
      for InPtr:=1 to InLen do
         Out16[InPtr] := UTF16Char( Swap( UTF16( Out16[InPtr] ) ) );
   end;
end;

// ----------------------------------------------------------------------------

function UTF8ToUTF7( const In8: UTF8Str; out Out7: UTF7Str ): Boolean;
var  Tmp16: UTF16Str;
begin
   Result := UTF8ToUTF16( In8, Tmp16 );
   Result := Result and UTF16ToUTF7( Tmp16, Out7 );
end;

function UTF7ToUTF8( const In7: UTF7Str; out Out8: UTF8Str ): Boolean;
var  Tmp16: UTF16Str;
begin
   Result := UTF7ToUTF16( In7, Tmp16 );
   Result := Result and UTF16ToUTF8( Tmp16, Out8 );
end;

// ----------------------------------------------------------------------------

end.

