View Issue Details

IDProjectCategoryView StatusLast Update
0038206PatchesPatchpublic2020-12-30 11:30
Reporterpowerpcer Assigned ToJuha Manninen  
PrioritynormalSeverityminorReproducibilityN/A
Status resolvedResolutionfixed 
Platformwindows 
Product Version2.0.4 
Summary0038206: asiancodepagefunctions.inc for support EUDC to PUA for Microsoft mapping
Descriptioni just add EUDC to PUA mapping for asian DBCS and make rough testing, it looks good.
hope this could help.
Additional Information
//Modified by powerpcer
function DBCSToUTF8(const s: string; CodeP: integer): string;
... Juha removed a big chunk of code ...
end;
TagsNo tags attached.
Fixed in Revisionr64308
LazTarget-
Widgetset
Attached Files

Activities

Juha Manninen

2020-12-12 09:12

developer   ~0127552

Please provide a patch against Lazarus trunk.

Bart Broersma

2020-12-12 14:19

developer   ~0127558

See: https://wiki.lazarus.freepascal.org/Creating_A_Patch

powerpcer

2020-12-14 09:25

reporter   ~0127603

this modification come from TRUNK
asiancodepagefunctions.inc (11,609 bytes)   
{%MainUnit ../lconvencoding.pp}

{
 *****************************************************************************
  This file is part of LazUtils.

  See the file COPYING.modifiedLGPL.txt, included in this distribution,
  for details about the license.
 *****************************************************************************

  Asian Unicode Functions.
  The clipboard is able to work with the windows and gtk behaviour/features.
}
//Modified by powerpcer
function DBCSToUTF8(const s: string; const ArrayUni, ArrayCP: array of word): string;
const
  cp936unodered:array[0..254] of Uint32=($a2ab,$a2ac,$a2ad,$a2ae,$a2af,$a2b0,$a2e3,$a2e4,$a2ef,$a2f0,$a2fd,$a2fe,$a4f4,$a4f5,$a4f6,$a4f7,$a4f8,$a4f9,$a4fa,$a4fb,$a4fc,$a4fd,$a4fe,$a5f7,$a5f8,$a5f9,$a5fa,$a5fb,$a5fc,$a5fd,$a5fe,$a6b9,$a6ba,$a6bb,$a6bc,$a6bd,$a6be,$a6bf,$a6c0,$a6d9,$a6da,$a6db,$a6dc,$a6dd,$a6de,$a6df,$a6ec,$a6ed,$a6f3,$a6f6,$a6f7,$a6f8,$a6f9,$a6fa,$a6fb,$a6fc,$a6fd,$a6fe,$a7c2,$a7c3,$a7c4,$a7c5,$a7c6,$a7c7,$a7c8,$a7c9,$a7ca,$a7cb,$a7cc,$a7cd,$a7ce,$a7cf,$a7d0,$a7f2,$a7f3,$a7f4,$a7f5,$a7f6,$a7f7,$a7f8,$a7f9,$a7fa,$a7fb,$a7fc,$a7fd,$a7fe,$a896,$a897,$a898,$a899,$a89a,$a89b,$a89c,$a89d,$a89e,$a89f,$a8a0,$a8bc,$a8bf,$a8c1,$a8c2,$a8c3,$a8c4,$a8ea,$a8eb,$a8ec,$a8ed,$a8ee,$a8ef,$a8f0,$a8f1,$a8f2,$a8f3,$a8f4,$a8f5,$a8f6,$a8f7,$a8f8,$a8f9,$a8fa,$a8fb,$a8fc,$a8fd,$a8fe,$a958,$a95b,$a95d,$a95e,$a95f,$a989,$a98a,$a98b,$a98c,$a98d,$a98e,$a98f,$a990,$a991,$a992,$a993,$a994,$a995,$a997,$a998,$a999,$a99a,$a99b,$a99c,$a99d,$a99e,$a99f,$a9a0,$a9a1,$a9a2,$a9a3,$a9f0,$a9f1,$a9f2,$a9f3,$a9f4,$a9f5,$a9f6,$a9f7,$a9f8,$a9f9,$a9fa,$a9fb,$a9fc,$a9fd,$a9fe,$d7fa,$d7fb,$d7fc,$d7fd,$d7fe,$fe50,$fe51,$fe52,$fe53,$fe54,$fe55,$fe56,$fe57,$fe58,$fe59,$fe5a,$fe5b,$fe5c,$fe5d,$fe5e,$fe5f,$fe60,$fe61,$fe62,$fe63,$fe64,$fe65,$fe66,$fe67,$fe68,$fe69,$fe6a,$fe6b,$fe6c,$fe6d,$fe6e,$fe6f,$fe70,$fe71,$fe72,$fe73,$fe74,$fe75,$fe76,$fe77,$fe78,$fe79,$fe7a,$fe7b,$fe7c,$fe7d,$fe7e,$fe80,$fe81,$fe82,$fe83,$fe84,$fe85,$fe86,$fe87,$fe88,$fe89,$fe8a,$fe8b,$fe8c,$fe8d,$fe8e,$fe8f,$fe90,$fe91,$fe92,$fe93,$fe94,$fe95,$fe96,$fe97,$fe98,$fe99,$fe9a,$fe9b,$fe9c,$fe9d,$fe9e,$fe9f,$fea0);
  cp936unoderedstart:Uint32=$e766;
var
  len, l,i: Integer;
  Src, Dest: PChar;
  c: char;
  code,code1: word;
  Hbyte,Lbyte:byte;
begin
  if s = '' then exit('');
  len := length(s);
  SetLength(Result, len * 4);// Asia UTF-8 is at most 4 bytes
  Src  := PChar(s);
  Dest := PChar(Result);
  repeat
    c := Src^;
    Hbyte := byte(Src^);
    Inc(Src);

    if Ord(c) < 128 then
    begin
      if (c=#0) and (Src-PChar(s)>=len) then break;
      Dest^ := c;
      Inc(Dest);
    end
    else
    begin
      code := Byte(c) shl 8;
      c:=Src^;
      Lbyte := byte(Src^);
      if (c=#0) and (Src-PChar(s)>=len) then break;
      code := code + Byte(c);
      code1:=code;
      Inc(Src);

      code := ArrayUni[SearchTable(ArrayCP, code)];
      if code=0 then
      begin
        case CodeP of
          936:
            begin

              if (HByte>=$AA) and (HByte<=$AF) and (LByte>=$A1) and (LByte<=$FE) then
              begin
                   code:= $e000 + (94 * (HByte-$aa)) + (LByte-$a1);
              end
              else
              if (HByte>=$F8) and (HByte<=$FE) and (LByte>=$A1) and (LByte<=$FE) then
              begin
                   code:= $e234 + (94 * (HByte-$f8)) + (LByte-$a1);
              end
              else
              if (HByte>=$A1) and (HByte<=$A7) and (LByte>=$40) and (LByte<=$A0) then
              begin
                   code:= $e4C6 + (97 * (HByte-$A1)) + (LByte-$40);
              end
              else
              begin
                for i:=0 to length(cp936unodered)-1 do
                begin
                  if code1=cp936unodered[i] then
                  begin
                       code:=cp936unoderedstart+i;
                       break;
                  end;
                end;
              end;
            end;
          950:
            begin
                 if (HByte>=$81) and (HByte<=$8d) and (LByte>=$40) and (LByte<=$FE) then
                 begin
                      code:= $eeb8 + (157 * (Hbyte-$81)) ;
                      if (Lbyte<$80) then
                       code:=code + (Lbyte-$40)
                      else
                       code:=code + (Lbyte-$62);
                 end
                 else
                 if (HByte>=$8e) and (HByte<=$a0) and (LByte>=$40) and (LByte<=$FE) then
                 begin
                   code:= $e311 + (157 * (Hbyte-$8e));
                   if (Lbyte<$80) then
                    code:=code + (Lbyte-$40)
                   else
                    code:=code + (Lbyte-$62);
                 end
                 else
                 if (HByte>=$c6) and (HByte<=$c8) and (LByte>=$a1) and (LByte<=$FE) then
                 begin
                   code:= $f672 + (157 * (Hbyte-$c6)) ;
                   if (Lbyte<$80) then
                    code:=code + (Lbyte-$40)
                   else
                    code:=code + (Lbyte-$62);
                 end
                 else
                 if (HByte>=$fa) and (HByte<=$fe) and (LByte>=$40) and (LByte<=$FE) then
                 begin
                   code:= $e000 + (157 * (Hbyte-$fa)) ;
                   if (Lbyte<$80) then
                    code:=code + (Lbyte-$40)
                   else
                    code:=code + (Lbyte-$62);
                 end;
            end;
          949:
            begin
                 if (HByte=$c9) and (LByte>=$a1) and (LByte<=$fe) then
                 begin
                      code:= $e000 + (94 * (HByte-$c9)) + (LByte-$a1) ;
                 end
                 else
                 if (HByte=$fe) and (LByte>=$a1) and (LByte<=$fe) then
                 begin
                      code:= $e05e + (94 * (HByte-$fe)) + (LByte-$a1) ;
                 end
                 else
                 if code1=$ff then
                 begin
                      code:= $f8f7;
                 end;
            end;
          932:
            begin
                 if (HByte>=$f0) and (HByte<=$f9) and (LByte>=$40) and (LByte<=$fc) then
                 begin
                      code:= $e000 + (188 * (HByte-$f0)) + (LByte-$40) ;
                      if LByte>$7f then
                         code:=code-1;
                 end
                 else
                 begin
                   case code1 of
                     $00a0:code:=$f8f0;
                     $00fd:code:=$f8f1;
                     $00fe:code:=$f8f2;
                     $00ff:code:=$f8f3;
                   end;

                 end;
            end
          else
            code := 0;
        end;
      end;
      if code>0 then
      begin
        l:=UnicodeToUTF8Inline(code,Dest);
        inc(Dest,l);
      end
      else
      case ConvertEncodingErrorMode of
        ceemSkip:
          begin end;
        ceemException:
          raise EConvertError.Create('Cannot convert DBCS code page to UTF8');
        ceemReplace:
          begin
            Dest^:='?';
            Inc(Dest);
          end;
        ceemReturnEmpty:
          Exit('');
      end;
    end;
  until false;
  SetLength(Result, {%H-}PtrUInt(Dest) - PtrUInt(Result));
end;

function CP936ToUTF8(const s: string): string;
begin
  Result := DBCSToUTF8(s, Uni936C, CP936CC);
end;

function CP950ToUTF8(const s: string): string;
begin
  Result := DBCSToUTF8(s, Uni950C, CP950CC);
end;

function CP949ToUTF8(const s: string): string;
begin
  Result := DBCSToUTF8(s, Uni949C, CP949CC);
end;

function CP932ToUTF8(const s: string): string;
begin
  Result := DBCSToUTF8(s, Uni932C, CP932CC);
end;

{$IfNDef UseSystemCPConv}
function UnicodeToCP936(Unicode: cardinal): integer;
begin
  case Unicode of
    0..127: Result := Unicode;
    else
      Result := CP936CU[SearchTable(Uni936U, Unicode)];
  end;
end;

function UnicodeToCP950(Unicode: cardinal): integer;
begin
  case Unicode of
    0..127: Result := Unicode;
    else
      Result := CP950CU[SearchTable(Uni950U, Unicode)];
  end;
end;

function UnicodeToCP949(Unicode: cardinal): integer;
begin
  case Unicode of
    0..127: Result := Unicode;
    else
      Result := CP949CU[SearchTable(Uni949U, Unicode)];
  end;
end;

function UnicodeToCP932(Unicode: cardinal): integer;
begin
  case Unicode of
    0..127: Result := Unicode;
    else
      Result := CP932CU[SearchTable(Uni932U, Unicode)];
  end;
end;
{$endif}

{$ifdef FPC_HAS_CPSTRING}
procedure InternalUTF8ToDBCS(const s: string; TargetCodePage: TSystemCodePage;
  SetTargetCodePage: boolean;
  {$IfNDef UseSystemCPConv}const UTF8CharConvFunc: TUnicodeToCharID;{$endif}
  out TheResult: RawByteString); inline;
begin
  {$ifdef UseSystemCPConv}
  TheResult:=s;
  SetCodePage(TheResult, TargetCodePage, True);
  if not SetTargetCodePage then
    SetCodePage(TheResult, CP_ACP, False);
  {$else}
  TheResult:=UTF8ToDBCS(s,UTF8CharConvFunc);
  if SetTargetCodePage then
    SetCodePage(TheResult, TargetCodePage, False);
  {$endif}
end;

function UTF8ToCP932(const s: string; SetTargetCodePage: boolean): RawByteString;
begin
  InternalUTF8ToDBCS(s,932,SetTargetCodePage{$IfNDef UseSystemCPConv},@UnicodeToCP932{$endif},Result);
end;

function UTF8ToCP936(const s: string; SetTargetCodePage: boolean): RawByteString;
begin
  InternalUTF8ToDBCS(s,936,SetTargetCodePage{$IfNDef UseSystemCPConv},@UnicodeToCP936{$endif},Result);
end;

function UTF8ToCP949(const s: string; SetTargetCodePage: boolean): RawByteString;
begin
  InternalUTF8ToDBCS(s,949,SetTargetCodePage{$IfNDef UseSystemCPConv},@UnicodeToCP949{$endif},Result);
end;

function UTF8ToCP950(const s: string; SetTargetCodePage: boolean): RawByteString;
begin
  InternalUTF8ToDBCS(s,950,SetTargetCodePage{$IfNDef UseSystemCPConv},@UnicodeToCP950{$endif},Result);
end;
{$ELSE}
function UTF8ToCP932(const s: string): string;
begin
  Result := UTF8ToDBCS(s, @UnicodeToCP932);
end;

function UTF8ToCP936(const s: string): string;
begin
  Result := UTF8ToDBCS(s, @UnicodeToCP936);
end;

function UTF8ToCP949(const s: string): string;
begin
  Result := UTF8ToDBCS(s, @UnicodeToCP949);
end;

function UTF8ToCP950(const s: string): string;
begin
  Result := UTF8ToDBCS(s, @UnicodeToCP950);
end;
{$ENDIF}

function UTF8ToDBCS(const s: string; const UTF8CharConvFunc: TUnicodeToCharID): string;
var
  len, i, CharLen: integer;
  Src, Dest: PChar;
  c: char;
  Unicode: longword;
begin
  if s = '' then exit('');
  len := length(s);
  SetLength(Result, len); // DBCS needs at most space as UTF-8
  Src  := PChar(s);
  Dest := PChar(Result);
  repeat
    c := Src^;
    if c < #128 then
    begin
      if (c=#0) and (Src-PChar(s)>=len) then break;
      Dest^ := c;
      Inc(Dest);
      Inc(Src);
    end
    else
    begin
      Unicode := UTF8CodepointToUnicode(Src, CharLen);
      Inc(Src, CharLen);
      i := UTF8CharConvFunc(Unicode);
      //writeln(Format('%X', [i]));
      if i >= 0 then
      begin
        if i > $ff then
        begin
          Dest^ := chr(i shr 8);
          Inc(Dest);
          Dest^ := chr(i);
        end
        else
          Dest^ := chr(i);
        Inc(Dest);
      end
      else
      case ConvertEncodingErrorMode of
        ceemSkip:
          begin end;
        ceemException:
          raise EConvertError.Create('Cannot convert UTF8 to DBCS code page');
        ceemReplace:
          begin
            Dest^ := '?';
            Inc(Dest);
          end;
        ceemReturnEmpty:
          Exit('');
      end;
    end;
  until false;
  //SetLength(Result, Dest - PChar(Result));
  SetLength(Result, {%H-}PtrUInt(Dest) - PtrUInt(Result));
end;
asiancodepagefunctions.inc (11,609 bytes)   

Juha Manninen

2020-12-14 13:51

developer   ~0127606

That is not a patch.

Bart Broersma

2020-12-14 14:51

developer   ~0127610

Last edited: 2020-12-14 14:53

View 3 revisions

@powerpcer: first of all: thanks for your efforts.
Th reason we keep asking for a proper patch is that, when you just present us the complete .inc file after your changes, it is very hard for the developers to see what has changed.
Therefor we ask you to provide a patch file.
If you have Lazarus trunk under version control (svn) then you do that as follows:
1. Open a console window (start->cmd.exe)
2. cd \path\to\lazarus_sources
3. svn diff > asiancodepages.diff
4. attach the file asiancodepages.diff here

If you do not have Lazarus trunk under version control you stil can use the diff utility that is provided with fpc.
You need the original version of the file (save that as asiancodepages.original.inc) and the modifed version.
Then:
1. Open a console window (start->cmd.exe)
2. cd \path\to\lazarus_sources\folder_where_asiancodepages.inc_is_located
3. diff -u asiancodepages.original.inc asiancodepages.inc > asiancodepages.diff
4. attach the file asiancodepages.diff here

Please remove the "//modifications made by ... " lines.
All this (who did it and why) will be stored in the svn log.

Note: the patch file muste be created against Lazarus trunk.

powerpcer

2020-12-14 23:26

reporter   ~0127613

done.
asiancodepagefunctions.diff (6,965 bytes)   
--- asiancodepagefunctions.original.inc	Tue Dec 15 06:22:00 2020
+++ asiancodepagefunctions.inc	Tue Dec 15 06:18:01 2020
@@ -11,13 +11,17 @@
   Asian Unicode Functions.
   The clipboard is able to work with the windows and gtk behaviour/features.
 }
-
+//Modified by powerpcer
 function DBCSToUTF8(const s: string; const ArrayUni, ArrayCP: array of word): string;
+const
+  cp936unodered:array[0..254] of Uint32=($a2ab,$a2ac,$a2ad,$a2ae,$a2af,$a2b0,$a2e3,$a2e4,$a2ef,$a2f0,$a2fd,$a2fe,$a4f4,$a4f5,$a4f6,$a4f7,$a4f8,$a4f9,$a4fa,$a4fb,$a4fc,$a4fd,$a4fe,$a5f7,$a5f8,$a5f9,$a5fa,$a5fb,$a5fc,$a5fd,$a5fe,$a6b9,$a6ba,$a6bb,$a6bc,$a6bd,$a6be,$a6bf,$a6c0,$a6d9,$a6da,$a6db,$a6dc,$a6dd,$a6de,$a6df,$a6ec,$a6ed,$a6f3,$a6f6,$a6f7,$a6f8,$a6f9,$a6fa,$a6fb,$a6fc,$a6fd,$a6fe,$a7c2,$a7c3,$a7c4,$a7c5,$a7c6,$a7c7,$a7c8,$a7c9,$a7ca,$a7cb,$a7cc,$a7cd,$a7ce,$a7cf,$a7d0,$a7f2,$a7f3,$a7f4,$a7f5,$a7f6,$a7f7,$a7f8,$a7f9,$a7fa,$a7fb,$a7fc,$a7fd,$a7fe,$a896,$a897,$a898,$a899,$a89a,$a89b,$a89c,$a89d,$a89e,$a89f,$a8a0,$a8bc,$a8bf,$a8c1,$a8c2,$a8c3,$a8c4,$a8ea,$a8eb,$a8ec,$a8ed,$a8ee,$a8ef,$a8f0,$a8f1,$a8f2,$a8f3,$a8f4,$a8f5,$a8f6,$a8f7,$a8f8,$a8f9,$a8fa,$a8fb,$a8fc,$a8fd,$a8fe,$a958,$a95b,$a95d,$a95e,$a95f,$a989,$a98a,$a98b,$a98c,$a98d,$a98e,$a98f,$a990,$a991,$a992,$a993,$a994,$a995,$a997,$a998,$a999,$a99a,$a99b,$a99c,$a99d,$a99e,$a99f,$a9a0,$a9a1,$a9a2,$a9a3,$a9f0,$a9f1,$a9f2,$a9f3,$a9f4,$a9f5,$a9f6,$a9f7,$a9f8,$a9f9,$a9fa,$a9fb,$a9fc,$a9fd,$a9fe,$d7fa,$d7fb,$d7fc,$d7fd,$d7fe,$fe50,$fe51,$fe52,$fe53,$fe54,$fe55,$fe56,$fe57,$fe58,$fe59,$fe5a,$fe5b,$fe5c,$fe5d,$fe5e,$fe5f,$fe60,$fe61,$fe62,$fe63,$fe64,$fe65,$fe66,$fe67,$fe68,$fe69,$fe6a,$fe6b,$fe6c,$fe6d,$fe6e,$fe6f,$fe70,$fe71,$fe72,$fe73,$fe74,$fe75,$fe76,$fe77,$fe78,$fe79,$fe7a,$fe7b,$fe7c,$fe7d,$fe7e,$fe80,$fe81,$fe82,$fe83,$fe84,$fe85,$fe86,$fe87,$fe88,$fe89,$fe8a,$fe8b,$fe8c,$fe8d,$fe8e,$fe8f,$fe90,$fe91,$fe92,$fe93,$fe94,$fe95,$fe96,$fe97,$fe98,$fe99,$fe9a,$fe9b,$fe9c,$fe9d,$fe9e,$fe9f,$fea0);
+  cp936unoderedstart:Uint32=$e766;
 var
-  len, l: Integer;
+  len, l,i: Integer;
   Src, Dest: PChar;
   c: char;
-  code: word;
+  code,code1: word;
+  Hbyte,Lbyte:byte;
 begin
   if s = '' then exit('');
   len := length(s);
@@ -26,7 +30,9 @@
   Dest := PChar(Result);
   repeat
     c := Src^;
+    Hbyte := byte(Src^);
     Inc(Src);
+
     if Ord(c) < 128 then
     begin
       if (c=#0) and (Src-PChar(s)>=len) then break;
@@ -37,11 +43,123 @@
     begin
       code := Byte(c) shl 8;
       c:=Src^;
+      Lbyte := byte(Src^);
       if (c=#0) and (Src-PChar(s)>=len) then break;
       code := code + Byte(c);
+      code1:=code;
       Inc(Src);
 
       code := ArrayUni[SearchTable(ArrayCP, code)];
+      if code=0 then
+      begin
+        case CodeP of
+          936:
+            begin
+
+              if (HByte>=$AA) and (HByte<=$AF) and (LByte>=$A1) and (LByte<=$FE) then
+              begin
+                   code:= $e000 + (94 * (HByte-$aa)) + (LByte-$a1);
+              end
+              else
+              if (HByte>=$F8) and (HByte<=$FE) and (LByte>=$A1) and (LByte<=$FE) then
+              begin
+                   code:= $e234 + (94 * (HByte-$f8)) + (LByte-$a1);
+              end
+              else
+              if (HByte>=$A1) and (HByte<=$A7) and (LByte>=$40) and (LByte<=$A0) then
+              begin
+                   code:= $e4C6 + (97 * (HByte-$A1)) + (LByte-$40);
+              end
+              else
+              begin
+                for i:=0 to length(cp936unodered)-1 do
+                begin
+                  if code1=cp936unodered[i] then
+                  begin
+                       code:=cp936unoderedstart+i;
+                       break;
+                  end;
+                end;
+              end;
+            end;
+          950:
+            begin
+                 if (HByte>=$81) and (HByte<=$8d) and (LByte>=$40) and (LByte<=$FE) then
+                 begin
+                      code:= $eeb8 + (157 * (Hbyte-$81)) ;
+                      if (Lbyte<$80) then
+                       code:=code + (Lbyte-$40)
+                      else
+                       code:=code + (Lbyte-$62);
+                 end
+                 else
+                 if (HByte>=$8e) and (HByte<=$a0) and (LByte>=$40) and (LByte<=$FE) then
+                 begin
+                   code:= $e311 + (157 * (Hbyte-$8e));
+                   if (Lbyte<$80) then
+                    code:=code + (Lbyte-$40)
+                   else
+                    code:=code + (Lbyte-$62);
+                 end
+                 else
+                 if (HByte>=$c6) and (HByte<=$c8) and (LByte>=$a1) and (LByte<=$FE) then
+                 begin
+                   code:= $f672 + (157 * (Hbyte-$c6)) ;
+                   if (Lbyte<$80) then
+                    code:=code + (Lbyte-$40)
+                   else
+                    code:=code + (Lbyte-$62);
+                 end
+                 else
+                 if (HByte>=$fa) and (HByte<=$fe) and (LByte>=$40) and (LByte<=$FE) then
+                 begin
+                   code:= $e000 + (157 * (Hbyte-$fa)) ;
+                   if (Lbyte<$80) then
+                    code:=code + (Lbyte-$40)
+                   else
+                    code:=code + (Lbyte-$62);
+                 end;
+            end;
+          949:
+            begin
+                 if (HByte=$c9) and (LByte>=$a1) and (LByte<=$fe) then
+                 begin
+                      code:= $e000 + (94 * (HByte-$c9)) + (LByte-$a1) ;
+                 end
+                 else
+                 if (HByte=$fe) and (LByte>=$a1) and (LByte<=$fe) then
+                 begin
+                      code:= $e05e + (94 * (HByte-$fe)) + (LByte-$a1) ;
+                 end
+                 else
+                 if code1=$ff then
+                 begin
+                      code:= $f8f7;
+                 end;
+            end;
+          932:
+            begin
+                 if (HByte>=$f0) and (HByte<=$f9) and (LByte>=$40) and (LByte<=$fc) then
+                 begin
+                      code:= $e000 + (188 * (HByte-$f0)) + (LByte-$40) ;
+                      if LByte>$7f then
+                         code:=code-1;
+                 end
+                 else
+                 begin
+                   case code1 of
+                     $00a0:code:=$f8f0;
+                     $00fd:code:=$f8f1;
+                     $00fe:code:=$f8f2;
+                     $00ff:code:=$f8f3;
+                   end;
+
+                 end;
+            end
+          else
+            code := 0;
+        end;
+      end;
       if code>0 then
       begin
         l:=UnicodeToUTF8Inline(code,Dest);
@@ -241,4 +359,3 @@
   //SetLength(Result, Dest - PChar(Result));
   SetLength(Result, {%H-}PtrUInt(Dest) - PtrUInt(Result));
 end;
-
asiancodepagefunctions.diff (6,965 bytes)   
asiancodepagefunctions-2.inc (11,586 bytes)   
{%MainUnit ../lconvencoding.pp}

{
 *****************************************************************************
  This file is part of LazUtils.

  See the file COPYING.modifiedLGPL.txt, included in this distribution,
  for details about the license.
 *****************************************************************************

  Asian Unicode Functions.
  The clipboard is able to work with the windows and gtk behaviour/features.
}

function DBCSToUTF8(const s: string; const ArrayUni, ArrayCP: array of word): string;
const
  cp936unodered:array[0..254] of Uint32=($a2ab,$a2ac,$a2ad,$a2ae,$a2af,$a2b0,$a2e3,$a2e4,$a2ef,$a2f0,$a2fd,$a2fe,$a4f4,$a4f5,$a4f6,$a4f7,$a4f8,$a4f9,$a4fa,$a4fb,$a4fc,$a4fd,$a4fe,$a5f7,$a5f8,$a5f9,$a5fa,$a5fb,$a5fc,$a5fd,$a5fe,$a6b9,$a6ba,$a6bb,$a6bc,$a6bd,$a6be,$a6bf,$a6c0,$a6d9,$a6da,$a6db,$a6dc,$a6dd,$a6de,$a6df,$a6ec,$a6ed,$a6f3,$a6f6,$a6f7,$a6f8,$a6f9,$a6fa,$a6fb,$a6fc,$a6fd,$a6fe,$a7c2,$a7c3,$a7c4,$a7c5,$a7c6,$a7c7,$a7c8,$a7c9,$a7ca,$a7cb,$a7cc,$a7cd,$a7ce,$a7cf,$a7d0,$a7f2,$a7f3,$a7f4,$a7f5,$a7f6,$a7f7,$a7f8,$a7f9,$a7fa,$a7fb,$a7fc,$a7fd,$a7fe,$a896,$a897,$a898,$a899,$a89a,$a89b,$a89c,$a89d,$a89e,$a89f,$a8a0,$a8bc,$a8bf,$a8c1,$a8c2,$a8c3,$a8c4,$a8ea,$a8eb,$a8ec,$a8ed,$a8ee,$a8ef,$a8f0,$a8f1,$a8f2,$a8f3,$a8f4,$a8f5,$a8f6,$a8f7,$a8f8,$a8f9,$a8fa,$a8fb,$a8fc,$a8fd,$a8fe,$a958,$a95b,$a95d,$a95e,$a95f,$a989,$a98a,$a98b,$a98c,$a98d,$a98e,$a98f,$a990,$a991,$a992,$a993,$a994,$a995,$a997,$a998,$a999,$a99a,$a99b,$a99c,$a99d,$a99e,$a99f,$a9a0,$a9a1,$a9a2,$a9a3,$a9f0,$a9f1,$a9f2,$a9f3,$a9f4,$a9f5,$a9f6,$a9f7,$a9f8,$a9f9,$a9fa,$a9fb,$a9fc,$a9fd,$a9fe,$d7fa,$d7fb,$d7fc,$d7fd,$d7fe,$fe50,$fe51,$fe52,$fe53,$fe54,$fe55,$fe56,$fe57,$fe58,$fe59,$fe5a,$fe5b,$fe5c,$fe5d,$fe5e,$fe5f,$fe60,$fe61,$fe62,$fe63,$fe64,$fe65,$fe66,$fe67,$fe68,$fe69,$fe6a,$fe6b,$fe6c,$fe6d,$fe6e,$fe6f,$fe70,$fe71,$fe72,$fe73,$fe74,$fe75,$fe76,$fe77,$fe78,$fe79,$fe7a,$fe7b,$fe7c,$fe7d,$fe7e,$fe80,$fe81,$fe82,$fe83,$fe84,$fe85,$fe86,$fe87,$fe88,$fe89,$fe8a,$fe8b,$fe8c,$fe8d,$fe8e,$fe8f,$fe90,$fe91,$fe92,$fe93,$fe94,$fe95,$fe96,$fe97,$fe98,$fe99,$fe9a,$fe9b,$fe9c,$fe9d,$fe9e,$fe9f,$fea0);
  cp936unoderedstart:Uint32=$e766;
var
  len, l,i: Integer;
  Src, Dest: PChar;
  c: char;
  code,code1: word;
  Hbyte,Lbyte:byte;
begin
  if s = '' then exit('');
  len := length(s);
  SetLength(Result, len * 4);// Asia UTF-8 is at most 4 bytes
  Src  := PChar(s);
  Dest := PChar(Result);
  repeat
    c := Src^;
    Hbyte := byte(Src^);
    Inc(Src);

    if Ord(c) < 128 then
    begin
      if (c=#0) and (Src-PChar(s)>=len) then break;
      Dest^ := c;
      Inc(Dest);
    end
    else
    begin
      code := Byte(c) shl 8;
      c:=Src^;
      Lbyte := byte(Src^);
      if (c=#0) and (Src-PChar(s)>=len) then break;
      code := code + Byte(c);
      code1:=code;
      Inc(Src);

      code := ArrayUni[SearchTable(ArrayCP, code)];
      if code=0 then
      begin
        case CodeP of
          936:
            begin

              if (HByte>=$AA) and (HByte<=$AF) and (LByte>=$A1) and (LByte<=$FE) then
              begin
                   code:= $e000 + (94 * (HByte-$aa)) + (LByte-$a1);
              end
              else
              if (HByte>=$F8) and (HByte<=$FE) and (LByte>=$A1) and (LByte<=$FE) then
              begin
                   code:= $e234 + (94 * (HByte-$f8)) + (LByte-$a1);
              end
              else
              if (HByte>=$A1) and (HByte<=$A7) and (LByte>=$40) and (LByte<=$A0) then
              begin
                   code:= $e4C6 + (97 * (HByte-$A1)) + (LByte-$40);
              end
              else
              begin
                for i:=0 to length(cp936unodered)-1 do
                begin
                  if code1=cp936unodered[i] then
                  begin
                       code:=cp936unoderedstart+i;
                       break;
                  end;
                end;
              end;
            end;
          950:
            begin
                 if (HByte>=$81) and (HByte<=$8d) and (LByte>=$40) and (LByte<=$FE) then
                 begin
                      code:= $eeb8 + (157 * (Hbyte-$81)) ;
                      if (Lbyte<$80) then
                       code:=code + (Lbyte-$40)
                      else
                       code:=code + (Lbyte-$62);
                 end
                 else
                 if (HByte>=$8e) and (HByte<=$a0) and (LByte>=$40) and (LByte<=$FE) then
                 begin
                   code:= $e311 + (157 * (Hbyte-$8e));
                   if (Lbyte<$80) then
                    code:=code + (Lbyte-$40)
                   else
                    code:=code + (Lbyte-$62);
                 end
                 else
                 if (HByte>=$c6) and (HByte<=$c8) and (LByte>=$a1) and (LByte<=$FE) then
                 begin
                   code:= $f672 + (157 * (Hbyte-$c6)) ;
                   if (Lbyte<$80) then
                    code:=code + (Lbyte-$40)
                   else
                    code:=code + (Lbyte-$62);
                 end
                 else
                 if (HByte>=$fa) and (HByte<=$fe) and (LByte>=$40) and (LByte<=$FE) then
                 begin
                   code:= $e000 + (157 * (Hbyte-$fa)) ;
                   if (Lbyte<$80) then
                    code:=code + (Lbyte-$40)
                   else
                    code:=code + (Lbyte-$62);
                 end;
            end;
          949:
            begin
                 if (HByte=$c9) and (LByte>=$a1) and (LByte<=$fe) then
                 begin
                      code:= $e000 + (94 * (HByte-$c9)) + (LByte-$a1) ;
                 end
                 else
                 if (HByte=$fe) and (LByte>=$a1) and (LByte<=$fe) then
                 begin
                      code:= $e05e + (94 * (HByte-$fe)) + (LByte-$a1) ;
                 end
                 else
                 if code1=$ff then
                 begin
                      code:= $f8f7;
                 end;
            end;
          932:
            begin
                 if (HByte>=$f0) and (HByte<=$f9) and (LByte>=$40) and (LByte<=$fc) then
                 begin
                      code:= $e000 + (188 * (HByte-$f0)) + (LByte-$40) ;
                      if LByte>$7f then
                         code:=code-1;
                 end
                 else
                 begin
                   case code1 of
                     $00a0:code:=$f8f0;
                     $00fd:code:=$f8f1;
                     $00fe:code:=$f8f2;
                     $00ff:code:=$f8f3;
                   end;

                 end;
            end
          else
            code := 0;
        end;
      end;
      if code>0 then
      begin
        l:=UnicodeToUTF8Inline(code,Dest);
        inc(Dest,l);
      end
      else
      case ConvertEncodingErrorMode of
        ceemSkip:
          begin end;
        ceemException:
          raise EConvertError.Create('Cannot convert DBCS code page to UTF8');
        ceemReplace:
          begin
            Dest^:='?';
            Inc(Dest);
          end;
        ceemReturnEmpty:
          Exit('');
      end;
    end;
  until false;
  SetLength(Result, {%H-}PtrUInt(Dest) - PtrUInt(Result));
end;

function CP936ToUTF8(const s: string): string;
begin
  Result := DBCSToUTF8(s, Uni936C, CP936CC);
end;

function CP950ToUTF8(const s: string): string;
begin
  Result := DBCSToUTF8(s, Uni950C, CP950CC);
end;

function CP949ToUTF8(const s: string): string;
begin
  Result := DBCSToUTF8(s, Uni949C, CP949CC);
end;

function CP932ToUTF8(const s: string): string;
begin
  Result := DBCSToUTF8(s, Uni932C, CP932CC);
end;

{$IfNDef UseSystemCPConv}
function UnicodeToCP936(Unicode: cardinal): integer;
begin
  case Unicode of
    0..127: Result := Unicode;
    else
      Result := CP936CU[SearchTable(Uni936U, Unicode)];
  end;
end;

function UnicodeToCP950(Unicode: cardinal): integer;
begin
  case Unicode of
    0..127: Result := Unicode;
    else
      Result := CP950CU[SearchTable(Uni950U, Unicode)];
  end;
end;

function UnicodeToCP949(Unicode: cardinal): integer;
begin
  case Unicode of
    0..127: Result := Unicode;
    else
      Result := CP949CU[SearchTable(Uni949U, Unicode)];
  end;
end;

function UnicodeToCP932(Unicode: cardinal): integer;
begin
  case Unicode of
    0..127: Result := Unicode;
    else
      Result := CP932CU[SearchTable(Uni932U, Unicode)];
  end;
end;
{$endif}

{$ifdef FPC_HAS_CPSTRING}
procedure InternalUTF8ToDBCS(const s: string; TargetCodePage: TSystemCodePage;
  SetTargetCodePage: boolean;
  {$IfNDef UseSystemCPConv}const UTF8CharConvFunc: TUnicodeToCharID;{$endif}
  out TheResult: RawByteString); inline;
begin
  {$ifdef UseSystemCPConv}
  TheResult:=s;
  SetCodePage(TheResult, TargetCodePage, True);
  if not SetTargetCodePage then
    SetCodePage(TheResult, CP_ACP, False);
  {$else}
  TheResult:=UTF8ToDBCS(s,UTF8CharConvFunc);
  if SetTargetCodePage then
    SetCodePage(TheResult, TargetCodePage, False);
  {$endif}
end;

function UTF8ToCP932(const s: string; SetTargetCodePage: boolean): RawByteString;
begin
  InternalUTF8ToDBCS(s,932,SetTargetCodePage{$IfNDef UseSystemCPConv},@UnicodeToCP932{$endif},Result);
end;

function UTF8ToCP936(const s: string; SetTargetCodePage: boolean): RawByteString;
begin
  InternalUTF8ToDBCS(s,936,SetTargetCodePage{$IfNDef UseSystemCPConv},@UnicodeToCP936{$endif},Result);
end;

function UTF8ToCP949(const s: string; SetTargetCodePage: boolean): RawByteString;
begin
  InternalUTF8ToDBCS(s,949,SetTargetCodePage{$IfNDef UseSystemCPConv},@UnicodeToCP949{$endif},Result);
end;

function UTF8ToCP950(const s: string; SetTargetCodePage: boolean): RawByteString;
begin
  InternalUTF8ToDBCS(s,950,SetTargetCodePage{$IfNDef UseSystemCPConv},@UnicodeToCP950{$endif},Result);
end;
{$ELSE}
function UTF8ToCP932(const s: string): string;
begin
  Result := UTF8ToDBCS(s, @UnicodeToCP932);
end;

function UTF8ToCP936(const s: string): string;
begin
  Result := UTF8ToDBCS(s, @UnicodeToCP936);
end;

function UTF8ToCP949(const s: string): string;
begin
  Result := UTF8ToDBCS(s, @UnicodeToCP949);
end;

function UTF8ToCP950(const s: string): string;
begin
  Result := UTF8ToDBCS(s, @UnicodeToCP950);
end;
{$ENDIF}

function UTF8ToDBCS(const s: string; const UTF8CharConvFunc: TUnicodeToCharID): string;
var
  len, i, CharLen: integer;
  Src, Dest: PChar;
  c: char;
  Unicode: longword;
begin
  if s = '' then exit('');
  len := length(s);
  SetLength(Result, len); // DBCS needs at most space as UTF-8
  Src  := PChar(s);
  Dest := PChar(Result);
  repeat
    c := Src^;
    if c < #128 then
    begin
      if (c=#0) and (Src-PChar(s)>=len) then break;
      Dest^ := c;
      Inc(Dest);
      Inc(Src);
    end
    else
    begin
      Unicode := UTF8CodepointToUnicode(Src, CharLen);
      Inc(Src, CharLen);
      i := UTF8CharConvFunc(Unicode);
      //writeln(Format('%X', [i]));
      if i >= 0 then
      begin
        if i > $ff then
        begin
          Dest^ := chr(i shr 8);
          Inc(Dest);
          Dest^ := chr(i);
        end
        else
          Dest^ := chr(i);
        Inc(Dest);
      end
      else
      case ConvertEncodingErrorMode of
        ceemSkip:
          begin end;
        ceemException:
          raise EConvertError.Create('Cannot convert UTF8 to DBCS code page');
        ceemReplace:
          begin
            Dest^ := '?';
            Inc(Dest);
          end;
        ceemReturnEmpty:
          Exit('');
      end;
    end;
  until false;
  //SetLength(Result, Dest - PChar(Result));
  SetLength(Result, {%H-}PtrUInt(Dest) - PtrUInt(Result));
end;
asiancodepagefunctions-2.inc (11,586 bytes)   

Juha Manninen

2020-12-15 19:25

developer   ~0127625

Ok, it looks like a substantial patch. However it does not compile because of undefined "CodeP".

When we get a patch that compiles, how to test it? I guess it will be difficult with a European locale. In that case I will ask feedback from others and commit the change if nobody objects.

powerpcer

2020-12-16 00:20

reporter   ~0127627

ok, i fix that.
asiancodepagefunctions-3.inc (11,618 bytes)   
{%MainUnit ../lconvencoding.pp}

{
 *****************************************************************************
  This file is part of LazUtils.

  See the file COPYING.modifiedLGPL.txt, included in this distribution,
  for details about the license.
 *****************************************************************************

  Asian Unicode Functions.
  The clipboard is able to work with the windows and gtk behaviour/features.
}

function DBCSToUTF8(const s: string; const ArrayUni, ArrayCP: array of word; CodeP: integer): string;
const
  cp936unodered:array[0..254] of Uint32=($a2ab,$a2ac,$a2ad,$a2ae,$a2af,$a2b0,$a2e3,$a2e4,$a2ef,$a2f0,$a2fd,$a2fe,$a4f4,$a4f5,$a4f6,$a4f7,$a4f8,$a4f9,$a4fa,$a4fb,$a4fc,$a4fd,$a4fe,$a5f7,$a5f8,$a5f9,$a5fa,$a5fb,$a5fc,$a5fd,$a5fe,$a6b9,$a6ba,$a6bb,$a6bc,$a6bd,$a6be,$a6bf,$a6c0,$a6d9,$a6da,$a6db,$a6dc,$a6dd,$a6de,$a6df,$a6ec,$a6ed,$a6f3,$a6f6,$a6f7,$a6f8,$a6f9,$a6fa,$a6fb,$a6fc,$a6fd,$a6fe,$a7c2,$a7c3,$a7c4,$a7c5,$a7c6,$a7c7,$a7c8,$a7c9,$a7ca,$a7cb,$a7cc,$a7cd,$a7ce,$a7cf,$a7d0,$a7f2,$a7f3,$a7f4,$a7f5,$a7f6,$a7f7,$a7f8,$a7f9,$a7fa,$a7fb,$a7fc,$a7fd,$a7fe,$a896,$a897,$a898,$a899,$a89a,$a89b,$a89c,$a89d,$a89e,$a89f,$a8a0,$a8bc,$a8bf,$a8c1,$a8c2,$a8c3,$a8c4,$a8ea,$a8eb,$a8ec,$a8ed,$a8ee,$a8ef,$a8f0,$a8f1,$a8f2,$a8f3,$a8f4,$a8f5,$a8f6,$a8f7,$a8f8,$a8f9,$a8fa,$a8fb,$a8fc,$a8fd,$a8fe,$a958,$a95b,$a95d,$a95e,$a95f,$a989,$a98a,$a98b,$a98c,$a98d,$a98e,$a98f,$a990,$a991,$a992,$a993,$a994,$a995,$a997,$a998,$a999,$a99a,$a99b,$a99c,$a99d,$a99e,$a99f,$a9a0,$a9a1,$a9a2,$a9a3,$a9f0,$a9f1,$a9f2,$a9f3,$a9f4,$a9f5,$a9f6,$a9f7,$a9f8,$a9f9,$a9fa,$a9fb,$a9fc,$a9fd,$a9fe,$d7fa,$d7fb,$d7fc,$d7fd,$d7fe,$fe50,$fe51,$fe52,$fe53,$fe54,$fe55,$fe56,$fe57,$fe58,$fe59,$fe5a,$fe5b,$fe5c,$fe5d,$fe5e,$fe5f,$fe60,$fe61,$fe62,$fe63,$fe64,$fe65,$fe66,$fe67,$fe68,$fe69,$fe6a,$fe6b,$fe6c,$fe6d,$fe6e,$fe6f,$fe70,$fe71,$fe72,$fe73,$fe74,$fe75,$fe76,$fe77,$fe78,$fe79,$fe7a,$fe7b,$fe7c,$fe7d,$fe7e,$fe80,$fe81,$fe82,$fe83,$fe84,$fe85,$fe86,$fe87,$fe88,$fe89,$fe8a,$fe8b,$fe8c,$fe8d,$fe8e,$fe8f,$fe90,$fe91,$fe92,$fe93,$fe94,$fe95,$fe96,$fe97,$fe98,$fe99,$fe9a,$fe9b,$fe9c,$fe9d,$fe9e,$fe9f,$fea0);
  cp936unoderedstart:Uint32=$e766;
var
  len, l,i: Integer;
  Src, Dest: PChar;
  c: char;
  code,code1: word;
  Hbyte,Lbyte:byte;
begin
  if s = '' then exit('');
  len := length(s);
  SetLength(Result, len * 4);// Asia UTF-8 is at most 4 bytes
  Src  := PChar(s);
  Dest := PChar(Result);
  repeat
    c := Src^;
    Hbyte := byte(Src^);
    Inc(Src);

    if Ord(c) < 128 then
    begin
      if (c=#0) and (Src-PChar(s)>=len) then break;
      Dest^ := c;
      Inc(Dest);
    end
    else
    begin
      code := Byte(c) shl 8;
      c:=Src^;
      Lbyte := byte(Src^);
      if (c=#0) and (Src-PChar(s)>=len) then break;
      code := code + Byte(c);
      code1:=code;
      Inc(Src);

      code := ArrayUni[SearchTable(ArrayCP, code)];
      if code=0 then
      begin
        case CodeP of
          936:
            begin

              if (HByte>=$AA) and (HByte<=$AF) and (LByte>=$A1) and (LByte<=$FE) then
              begin
                   code:= $e000 + (94 * (HByte-$aa)) + (LByte-$a1);
              end
              else
              if (HByte>=$F8) and (HByte<=$FE) and (LByte>=$A1) and (LByte<=$FE) then
              begin
                   code:= $e234 + (94 * (HByte-$f8)) + (LByte-$a1);
              end
              else
              if (HByte>=$A1) and (HByte<=$A7) and (LByte>=$40) and (LByte<=$A0) then
              begin
                   code:= $e4C6 + (97 * (HByte-$A1)) + (LByte-$40);
              end
              else
              begin
                for i:=0 to length(cp936unodered)-1 do
                begin
                  if code1=cp936unodered[i] then
                  begin
                       code:=cp936unoderedstart+i;
                       break;
                  end;
                end;
              end;
            end;
          950:
            begin
                 if (HByte>=$81) and (HByte<=$8d) and (LByte>=$40) and (LByte<=$FE) then
                 begin
                      code:= $eeb8 + (157 * (Hbyte-$81)) ;
                      if (Lbyte<$80) then
                       code:=code + (Lbyte-$40)
                      else
                       code:=code + (Lbyte-$62);
                 end
                 else
                 if (HByte>=$8e) and (HByte<=$a0) and (LByte>=$40) and (LByte<=$FE) then
                 begin
                   code:= $e311 + (157 * (Hbyte-$8e));
                   if (Lbyte<$80) then
                    code:=code + (Lbyte-$40)
                   else
                    code:=code + (Lbyte-$62);
                 end
                 else
                 if (HByte>=$c6) and (HByte<=$c8) and (LByte>=$a1) and (LByte<=$FE) then
                 begin
                   code:= $f672 + (157 * (Hbyte-$c6)) ;
                   if (Lbyte<$80) then
                    code:=code + (Lbyte-$40)
                   else
                    code:=code + (Lbyte-$62);
                 end
                 else
                 if (HByte>=$fa) and (HByte<=$fe) and (LByte>=$40) and (LByte<=$FE) then
                 begin
                   code:= $e000 + (157 * (Hbyte-$fa)) ;
                   if (Lbyte<$80) then
                    code:=code + (Lbyte-$40)
                   else
                    code:=code + (Lbyte-$62);
                 end;
            end;
          949:
            begin
                 if (HByte=$c9) and (LByte>=$a1) and (LByte<=$fe) then
                 begin
                      code:= $e000 + (94 * (HByte-$c9)) + (LByte-$a1) ;
                 end
                 else
                 if (HByte=$fe) and (LByte>=$a1) and (LByte<=$fe) then
                 begin
                      code:= $e05e + (94 * (HByte-$fe)) + (LByte-$a1) ;
                 end
                 else
                 if code1=$ff then
                 begin
                      code:= $f8f7;
                 end;
            end;
          932:
            begin
                 if (HByte>=$f0) and (HByte<=$f9) and (LByte>=$40) and (LByte<=$fc) then
                 begin
                      code:= $e000 + (188 * (HByte-$f0)) + (LByte-$40) ;
                      if LByte>$7f then
                         code:=code-1;
                 end
                 else
                 begin
                   case code1 of
                     $00a0:code:=$f8f0;
                     $00fd:code:=$f8f1;
                     $00fe:code:=$f8f2;
                     $00ff:code:=$f8f3;
                   end;

                 end;
            end
          else
            code := 0;
        end;
      end;
      if code>0 then
      begin
        l:=UnicodeToUTF8Inline(code,Dest);
        inc(Dest,l);
      end
      else
      case ConvertEncodingErrorMode of
        ceemSkip:
          begin end;
        ceemException:
          raise EConvertError.Create('Cannot convert DBCS code page to UTF8');
        ceemReplace:
          begin
            Dest^:='?';
            Inc(Dest);
          end;
        ceemReturnEmpty:
          Exit('');
      end;
    end;
  until false;
  SetLength(Result, {%H-}PtrUInt(Dest) - PtrUInt(Result));
end;

function CP936ToUTF8(const s: string): string;
begin
  Result := DBCSToUTF8(s, Uni936C, CP936CC,936);
end;

function CP950ToUTF8(const s: string): string;
begin
  Result := DBCSToUTF8(s, Uni950C, CP950CC,950);
end;

function CP949ToUTF8(const s: string): string;
begin
  Result := DBCSToUTF8(s, Uni949C, CP949CC,949);
end;

function CP932ToUTF8(const s: string): string;
begin
  Result := DBCSToUTF8(s, Uni932C, CP932CC,932);
end;

{$IfNDef UseSystemCPConv}
function UnicodeToCP936(Unicode: cardinal): integer;
begin
  case Unicode of
    0..127: Result := Unicode;
    else
      Result := CP936CU[SearchTable(Uni936U, Unicode)];
  end;
end;

function UnicodeToCP950(Unicode: cardinal): integer;
begin
  case Unicode of
    0..127: Result := Unicode;
    else
      Result := CP950CU[SearchTable(Uni950U, Unicode)];
  end;
end;

function UnicodeToCP949(Unicode: cardinal): integer;
begin
  case Unicode of
    0..127: Result := Unicode;
    else
      Result := CP949CU[SearchTable(Uni949U, Unicode)];
  end;
end;

function UnicodeToCP932(Unicode: cardinal): integer;
begin
  case Unicode of
    0..127: Result := Unicode;
    else
      Result := CP932CU[SearchTable(Uni932U, Unicode)];
  end;
end;
{$endif}

{$ifdef FPC_HAS_CPSTRING}
procedure InternalUTF8ToDBCS(const s: string; TargetCodePage: TSystemCodePage;
  SetTargetCodePage: boolean;
  {$IfNDef UseSystemCPConv}const UTF8CharConvFunc: TUnicodeToCharID;{$endif}
  out TheResult: RawByteString); inline;
begin
  {$ifdef UseSystemCPConv}
  TheResult:=s;
  SetCodePage(TheResult, TargetCodePage, True);
  if not SetTargetCodePage then
    SetCodePage(TheResult, CP_ACP, False);
  {$else}
  TheResult:=UTF8ToDBCS(s,UTF8CharConvFunc);
  if SetTargetCodePage then
    SetCodePage(TheResult, TargetCodePage, False);
  {$endif}
end;

function UTF8ToCP932(const s: string; SetTargetCodePage: boolean): RawByteString;
begin
  InternalUTF8ToDBCS(s,932,SetTargetCodePage{$IfNDef UseSystemCPConv},@UnicodeToCP932{$endif},Result);
end;

function UTF8ToCP936(const s: string; SetTargetCodePage: boolean): RawByteString;
begin
  InternalUTF8ToDBCS(s,936,SetTargetCodePage{$IfNDef UseSystemCPConv},@UnicodeToCP936{$endif},Result);
end;

function UTF8ToCP949(const s: string; SetTargetCodePage: boolean): RawByteString;
begin
  InternalUTF8ToDBCS(s,949,SetTargetCodePage{$IfNDef UseSystemCPConv},@UnicodeToCP949{$endif},Result);
end;

function UTF8ToCP950(const s: string; SetTargetCodePage: boolean): RawByteString;
begin
  InternalUTF8ToDBCS(s,950,SetTargetCodePage{$IfNDef UseSystemCPConv},@UnicodeToCP950{$endif},Result);
end;
{$ELSE}
function UTF8ToCP932(const s: string): string;
begin
  Result := UTF8ToDBCS(s, @UnicodeToCP932);
end;

function UTF8ToCP936(const s: string): string;
begin
  Result := UTF8ToDBCS(s, @UnicodeToCP936);
end;

function UTF8ToCP949(const s: string): string;
begin
  Result := UTF8ToDBCS(s, @UnicodeToCP949);
end;

function UTF8ToCP950(const s: string): string;
begin
  Result := UTF8ToDBCS(s, @UnicodeToCP950);
end;
{$ENDIF}

function UTF8ToDBCS(const s: string; const UTF8CharConvFunc: TUnicodeToCharID): string;
var
  len, i, CharLen: integer;
  Src, Dest: PChar;
  c: char;
  Unicode: longword;
begin
  if s = '' then exit('');
  len := length(s);
  SetLength(Result, len); // DBCS needs at most space as UTF-8
  Src  := PChar(s);
  Dest := PChar(Result);
  repeat
    c := Src^;
    if c < #128 then
    begin
      if (c=#0) and (Src-PChar(s)>=len) then break;
      Dest^ := c;
      Inc(Dest);
      Inc(Src);
    end
    else
    begin
      Unicode := UTF8CodepointToUnicode(Src, CharLen);
      Inc(Src, CharLen);
      i := UTF8CharConvFunc(Unicode);
      //writeln(Format('%X', [i]));
      if i >= 0 then
      begin
        if i > $ff then
        begin
          Dest^ := chr(i shr 8);
          Inc(Dest);
          Dest^ := chr(i);
        end
        else
          Dest^ := chr(i);
        Inc(Dest);
      end
      else
      case ConvertEncodingErrorMode of
        ceemSkip:
          begin end;
        ceemException:
          raise EConvertError.Create('Cannot convert UTF8 to DBCS code page');
        ceemReplace:
          begin
            Dest^ := '?';
            Inc(Dest);
          end;
        ceemReturnEmpty:
          Exit('');
      end;
    end;
  until false;
  //SetLength(Result, Dest - PChar(Result));
  SetLength(Result, {%H-}PtrUInt(Dest) - PtrUInt(Result));
end;
asiancodepagefunctions-3.inc (11,618 bytes)   
asiancodepagefunctions-2.diff (7,727 bytes)   
--- asiancodepagefunctions_original.inc	Wed Dec 16 07:18:18 2020
+++ asiancodepagefunctions.inc	Wed Dec 16 07:15:30 2020
@@ -12,12 +12,16 @@
   The clipboard is able to work with the windows and gtk behaviour/features.
 }
 
-function DBCSToUTF8(const s: string; const ArrayUni, ArrayCP: array of word): string;
+function DBCSToUTF8(const s: string; const ArrayUni, ArrayCP: array of word; CodeP: integer): string;
+const
+  cp936unodered:array[0..254] of Uint32=($a2ab,$a2ac,$a2ad,$a2ae,$a2af,$a2b0,$a2e3,$a2e4,$a2ef,$a2f0,$a2fd,$a2fe,$a4f4,$a4f5,$a4f6,$a4f7,$a4f8,$a4f9,$a4fa,$a4fb,$a4fc,$a4fd,$a4fe,$a5f7,$a5f8,$a5f9,$a5fa,$a5fb,$a5fc,$a5fd,$a5fe,$a6b9,$a6ba,$a6bb,$a6bc,$a6bd,$a6be,$a6bf,$a6c0,$a6d9,$a6da,$a6db,$a6dc,$a6dd,$a6de,$a6df,$a6ec,$a6ed,$a6f3,$a6f6,$a6f7,$a6f8,$a6f9,$a6fa,$a6fb,$a6fc,$a6fd,$a6fe,$a7c2,$a7c3,$a7c4,$a7c5,$a7c6,$a7c7,$a7c8,$a7c9,$a7ca,$a7cb,$a7cc,$a7cd,$a7ce,$a7cf,$a7d0,$a7f2,$a7f3,$a7f4,$a7f5,$a7f6,$a7f7,$a7f8,$a7f9,$a7fa,$a7fb,$a7fc,$a7fd,$a7fe,$a896,$a897,$a898,$a899,$a89a,$a89b,$a89c,$a89d,$a89e,$a89f,$a8a0,$a8bc,$a8bf,$a8c1,$a8c2,$a8c3,$a8c4,$a8ea,$a8eb,$a8ec,$a8ed,$a8ee,$a8ef,$a8f0,$a8f1,$a8f2,$a8f3,$a8f4,$a8f5,$a8f6,$a8f7,$a8f8,$a8f9,$a8fa,$a8fb,$a8fc,$a8fd,$a8fe,$a958,$a95b,$a95d,$a95e,$a95f,$a989,$a98a,$a98b,$a98c,$a98d,$a98e,$a98f,$a990,$a991,$a992,$a993,$a994,$a995,$a997,$a998,$a999,$a99a,$a99b,$a99c,$a99d,$a99e,$a99f,$a9a0,$a9a1,$a9a2,$a9a3,$a9f0,$a9f1,$a9f2,$a9f3,$a9f4,$a9f5,$a9f6,$a9f7,$a9f8,$a9f9,$a9fa,$a9fb,$a9fc,$a9fd,$a9fe,$d7fa,$d7fb,$d7fc,$d7fd,$d7fe,$fe50,$fe51,$fe52,$fe53,$fe54,$fe55,$fe56,$fe57,$fe58,$fe59,$fe5a,$fe5b,$fe5c,$fe5d,$fe5e,$fe5f,$fe60,$fe61,$fe62,$fe63,$fe64,$fe65,$fe66,$fe67,$fe68,$fe69,$fe6a,$fe6b,$fe6c,$fe6d,$fe6e,$fe6f,$fe70,$fe71,$fe72,$fe73,$fe74,$fe75,$fe76,$fe77,$fe78,$fe79,$fe7a,$fe7b,$fe7c,$fe7d,$fe7e,$fe80,$fe81,$fe82,$fe83,$fe84,$fe85,$fe86,$fe87,$fe88,$fe89,$fe8a,$fe8b,$fe8c,$fe8d,$fe8e,$fe8f,$fe90,$fe91,$fe92,$fe93,$fe94,$fe95,$fe96,$fe97,$fe98,$fe99,$fe9a,$fe9b,$fe9c,$fe9d,$fe9e,$fe9f,$fea0);
+  cp936unoderedstart:Uint32=$e766;
 var
-  len, l: Integer;
+  len, l,i: Integer;
   Src, Dest: PChar;
   c: char;
-  code: word;
+  code,code1: word;
+  Hbyte,Lbyte:byte;
 begin
   if s = '' then exit('');
   len := length(s);
@@ -26,7 +30,9 @@
   Dest := PChar(Result);
   repeat
     c := Src^;
+    Hbyte := byte(Src^);
     Inc(Src);
+
     if Ord(c) < 128 then
     begin
       if (c=#0) and (Src-PChar(s)>=len) then break;
@@ -37,11 +43,123 @@
     begin
       code := Byte(c) shl 8;
       c:=Src^;
+      Lbyte := byte(Src^);
       if (c=#0) and (Src-PChar(s)>=len) then break;
       code := code + Byte(c);
+      code1:=code;
       Inc(Src);
 
       code := ArrayUni[SearchTable(ArrayCP, code)];
+      if code=0 then
+      begin
+        case CodeP of
+          936:
+            begin
+
+              if (HByte>=$AA) and (HByte<=$AF) and (LByte>=$A1) and (LByte<=$FE) then
+              begin
+                   code:= $e000 + (94 * (HByte-$aa)) + (LByte-$a1);
+              end
+              else
+              if (HByte>=$F8) and (HByte<=$FE) and (LByte>=$A1) and (LByte<=$FE) then
+              begin
+                   code:= $e234 + (94 * (HByte-$f8)) + (LByte-$a1);
+              end
+              else
+              if (HByte>=$A1) and (HByte<=$A7) and (LByte>=$40) and (LByte<=$A0) then
+              begin
+                   code:= $e4C6 + (97 * (HByte-$A1)) + (LByte-$40);
+              end
+              else
+              begin
+                for i:=0 to length(cp936unodered)-1 do
+                begin
+                  if code1=cp936unodered[i] then
+                  begin
+                       code:=cp936unoderedstart+i;
+                       break;
+                  end;
+                end;
+              end;
+            end;
+          950:
+            begin
+                 if (HByte>=$81) and (HByte<=$8d) and (LByte>=$40) and (LByte<=$FE) then
+                 begin
+                      code:= $eeb8 + (157 * (Hbyte-$81)) ;
+                      if (Lbyte<$80) then
+                       code:=code + (Lbyte-$40)
+                      else
+                       code:=code + (Lbyte-$62);
+                 end
+                 else
+                 if (HByte>=$8e) and (HByte<=$a0) and (LByte>=$40) and (LByte<=$FE) then
+                 begin
+                   code:= $e311 + (157 * (Hbyte-$8e));
+                   if (Lbyte<$80) then
+                    code:=code + (Lbyte-$40)
+                   else
+                    code:=code + (Lbyte-$62);
+                 end
+                 else
+                 if (HByte>=$c6) and (HByte<=$c8) and (LByte>=$a1) and (LByte<=$FE) then
+                 begin
+                   code:= $f672 + (157 * (Hbyte-$c6)) ;
+                   if (Lbyte<$80) then
+                    code:=code + (Lbyte-$40)
+                   else
+                    code:=code + (Lbyte-$62);
+                 end
+                 else
+                 if (HByte>=$fa) and (HByte<=$fe) and (LByte>=$40) and (LByte<=$FE) then
+                 begin
+                   code:= $e000 + (157 * (Hbyte-$fa)) ;
+                   if (Lbyte<$80) then
+                    code:=code + (Lbyte-$40)
+                   else
+                    code:=code + (Lbyte-$62);
+                 end;
+            end;
+          949:
+            begin
+                 if (HByte=$c9) and (LByte>=$a1) and (LByte<=$fe) then
+                 begin
+                      code:= $e000 + (94 * (HByte-$c9)) + (LByte-$a1) ;
+                 end
+                 else
+                 if (HByte=$fe) and (LByte>=$a1) and (LByte<=$fe) then
+                 begin
+                      code:= $e05e + (94 * (HByte-$fe)) + (LByte-$a1) ;
+                 end
+                 else
+                 if code1=$ff then
+                 begin
+                      code:= $f8f7;
+                 end;
+            end;
+          932:
+            begin
+                 if (HByte>=$f0) and (HByte<=$f9) and (LByte>=$40) and (LByte<=$fc) then
+                 begin
+                      code:= $e000 + (188 * (HByte-$f0)) + (LByte-$40) ;
+                      if LByte>$7f then
+                         code:=code-1;
+                 end
+                 else
+                 begin
+                   case code1 of
+                     $00a0:code:=$f8f0;
+                     $00fd:code:=$f8f1;
+                     $00fe:code:=$f8f2;
+                     $00ff:code:=$f8f3;
+                   end;
+
+                 end;
+            end
+          else
+            code := 0;
+        end;
+      end;
       if code>0 then
       begin
         l:=UnicodeToUTF8Inline(code,Dest);
@@ -68,22 +186,22 @@
 
 function CP936ToUTF8(const s: string): string;
 begin
-  Result := DBCSToUTF8(s, Uni936C, CP936CC);
+  Result := DBCSToUTF8(s, Uni936C, CP936CC,936);
 end;
 
 function CP950ToUTF8(const s: string): string;
 begin
-  Result := DBCSToUTF8(s, Uni950C, CP950CC);
+  Result := DBCSToUTF8(s, Uni950C, CP950CC,950);
 end;
 
 function CP949ToUTF8(const s: string): string;
 begin
-  Result := DBCSToUTF8(s, Uni949C, CP949CC);
+  Result := DBCSToUTF8(s, Uni949C, CP949CC,949);
 end;
 
 function CP932ToUTF8(const s: string): string;
 begin
-  Result := DBCSToUTF8(s, Uni932C, CP932CC);
+  Result := DBCSToUTF8(s, Uni932C, CP932CC,932);
 end;
 
 {$IfNDef UseSystemCPConv}
@@ -241,4 +359,3 @@
   //SetLength(Result, Dest - PChar(Result));
   SetLength(Result, {%H-}PtrUInt(Dest) - PtrUInt(Result));
 end;
-
asiancodepagefunctions-2.diff (7,727 bytes)   

Juha Manninen

2020-12-16 12:30

developer   ~0127633

Thanks. The new patch compiles OK.
BTW, we only need a patch. There is no need to upload the whole source file.

I have asked feedback in forum and mailing list because I don't know how to test it. Let's see what other people say.

powerpcer

2020-12-16 13:45

reporter   ~0127635

as i known, European language also has EUDC to PUA mapping requirement. but i am lazy to make this.
here is my testing samples
造字cp936.txt (10 bytes)   
�����ƌz�h
造字cp936.txt (10 bytes)   
造字cp949.txt (10 bytes)   
��ɤɭ�C�Q
造字cp949.txt (10 bytes)   
造字cp950.txt (71 bytes)   
277999      -�B�Ӱ��@�@�@�@�@�@�@�@�@�@�@�@�@                          
造字cp950.txt (71 bytes)   
造字cp932.txt (12 bytes)   
������^����
造字cp932.txt (12 bytes)   

CudaText man_

2020-12-27 13:37

reporter   ~0127815

@powerpcer
I have the https://github.com/Alexey-T/EncConv
which was a fork from asiancodepages too. Can you give the patch there? I will compile CudaText with this patch and ppl can test the patch in CudaText.

jamie philbrook

2020-12-27 15:16

reporter   ~0127820

Isn't there some other way to manage this?

I think this is getting out of hand and ridiculous !

it would be nice if we could simply pick the code tables we want to support in the app and only load those tables.

Juha Manninen

2020-12-30 11:30

developer   ~0127930

I applied the patch. I don't know of anybody else has tested it but now it gets tested better.
Further restructuring is possible, too.

Issue History

Date Modified Username Field Change
2020-12-12 03:47 powerpcer New Issue
2020-12-12 09:12 Juha Manninen Note Added: 0127552
2020-12-12 14:19 Bart Broersma Note Added: 0127558
2020-12-14 09:25 powerpcer Note Added: 0127603
2020-12-14 09:25 powerpcer File Added: asiancodepagefunctions.inc
2020-12-14 13:50 Juha Manninen Additional Information Updated View Revisions
2020-12-14 13:50 Juha Manninen LazTarget => -
2020-12-14 13:51 Juha Manninen Note Added: 0127606
2020-12-14 14:51 Bart Broersma Note Added: 0127610
2020-12-14 14:51 Bart Broersma Status new => feedback
2020-12-14 14:53 Bart Broersma Note Edited: 0127610 View Revisions
2020-12-14 14:53 Bart Broersma Note Edited: 0127610 View Revisions
2020-12-14 23:26 powerpcer Note Added: 0127613
2020-12-14 23:26 powerpcer File Added: asiancodepagefunctions.diff
2020-12-14 23:26 powerpcer File Added: asiancodepagefunctions-2.inc
2020-12-14 23:26 powerpcer Status feedback => new
2020-12-15 19:10 Juha Manninen Assigned To => Juha Manninen
2020-12-15 19:10 Juha Manninen Status new => assigned
2020-12-15 19:25 Juha Manninen Note Added: 0127625
2020-12-15 19:26 Juha Manninen Status assigned => feedback
2020-12-16 00:20 powerpcer Note Added: 0127627
2020-12-16 00:20 powerpcer File Added: asiancodepagefunctions-3.inc
2020-12-16 00:20 powerpcer File Added: asiancodepagefunctions-2.diff
2020-12-16 00:20 powerpcer Status feedback => assigned
2020-12-16 12:30 Juha Manninen Note Added: 0127633
2020-12-16 13:45 powerpcer Note Added: 0127635
2020-12-16 13:45 powerpcer File Added: 造字cp936.txt
2020-12-16 13:45 powerpcer File Added: 造字cp949.txt
2020-12-16 13:45 powerpcer File Added: 造字cp950.txt
2020-12-16 13:45 powerpcer File Added: 造字cp932.txt
2020-12-27 13:37 CudaText man_ Note Added: 0127815
2020-12-27 15:16 jamie philbrook Note Added: 0127820
2020-12-30 11:30 Juha Manninen Status assigned => resolved
2020-12-30 11:30 Juha Manninen Resolution open => fixed
2020-12-30 11:30 Juha Manninen Fixed in Revision => r64308
2020-12-30 11:30 Juha Manninen Note Added: 0127930