View Issue Details

IDProjectCategoryView StatusLast Update
0030897FPCFCLpublic2016-11-11 18:04
ReporterwpAssigned ToMichael Van Canneyt 
PrioritynormalSeverityminorReproducibilityalways
Status closedResolutionfixed 
Product Version3.0.0Product Build 
Target Version3.2.0Fixed in Version3.1.1 
Summary0030897: csvreadwrite failing to read files with BOM correctly
DescriptionSince version 3.0 fpc contains the csvdocument library. There is an issue if the input file contains a BOM (byte-order-mark). The csv parser always begins it works at the beginning of the stream. If the stream begins with a BOM and if the first cell, for example, is a string representing a date then the conversion from string to date fails because the BOM adds unreckognized characters to the string.

The added patch reads the first few bytes of the input stream and checks whether they are identical with a UTF8, UTF16-LE or UTF16-BE BOM and continues parsing right after the BOM. The BOM found is stored as a read-only property.
Steps To ReproduceRun the attached demo. It reads a UTF8-file with BOM; the first cell contains a date string. Reading aborts here because the BOM is contained in the string to be converted to date by means of ScanDateTime.
TagsNo tags attached.
Fixed in Revision34871
FPCOldBugId
FPCTarget
Attached Files
  • csvdocument-bom.zip (1,446 bytes)
  • csvreadwrite.pp.patch (1,652 bytes)
    Index: fcl-base/src/csvreadwrite.pp
    ===================================================================
    --- fcl-base/src/csvreadwrite.pp	(revision 34850)
    +++ fcl-base/src/csvreadwrite.pp	(working copy)
    @@ -92,6 +92,8 @@
     
       { TCSVParser }
     
    +  TCSVByteOrderMark = (bomNone, bomUTF8, bomUTF16LE, bomUTF16BE);
    +
       TCSVParser = class(TCSVHandler)
       private
         FFreeStream: Boolean;
    @@ -98,6 +100,7 @@
         // fields
         FSourceStream: TStream;
         FStrStreamWrapper: TStringStream;
    +    FBOM: TCSVByteOrderMark;
         // parser state
         EndOfFile: Boolean;
         EndOfLine: Boolean;
    @@ -140,6 +143,8 @@
         property MaxColCount: Integer read FMaxColCount;
         // Does the parser own the stream ? If true, a previous stream is freed when set or when parser is destroyed.
         Property FreeStream : Boolean Read FFreeStream Write FFreeStream;
    +    // Return BOM found in file
    +    property BOM: TCSVByteOrderMark read FBOM;
       end;
     
       // Sequential output to CSV stream
    @@ -443,9 +448,30 @@
     end;
     
     procedure TCSVParser.ResetParser;
    +var
    +  s: String;
    +  n: Integer;
     begin
       ClearOutput;
       FSourceStream.Seek(0, soFromBeginning);
    +  SetLengtH(s, 4);
    +  FSourceStream.ReadBuffer(s[1], 4);
    +  if (s[1] = #$EF) and (s[2] = #$BB) and (s[3] = #$BF) then begin
    +    FBOM := bomUTF8;
    +    n := 3;
    +  end else
    +  if (s[1] = #$FE) and (s[2] = #$FF) then begin
    +    FBOM := bomUTF16BE;
    +    n := 2;
    +  end else
    +  if (s[1] = #$FF) and (s[2] = #$FE) then begin
    +    FBOM := bomUTF16LE;
    +    n := 2;
    +  end else begin
    +    FBOM := bomNone;
    +    n := 0;
    +  end;
    +  FSourceStream.Seek(n, soFromBeginning);
       EndOfFile := False;
       NextChar;
     end;
    
    csvreadwrite.pp.patch (1,652 bytes)
  • csvreadwrite.pp.v2.patch (1,854 bytes)
    Index: fcl-base/src/csvreadwrite.pp
    ===================================================================
    --- fcl-base/src/csvreadwrite.pp	(revision 34850)
    +++ fcl-base/src/csvreadwrite.pp	(working copy)
    @@ -92,6 +92,8 @@
     
       { TCSVParser }
     
    +  TCSVByteOrderMark = (bomNone, bomUTF8, bomUTF16LE, bomUTF16BE);
    +
       TCSVParser = class(TCSVHandler)
       private
         FFreeStream: Boolean;
    @@ -98,6 +100,8 @@
         // fields
         FSourceStream: TStream;
         FStrStreamWrapper: TStringStream;
    +    FBOM: TCSVByteOrderMark;
    +    FSkipBOM: Boolean;
         // parser state
         EndOfFile: Boolean;
         EndOfLine: Boolean;
    @@ -140,6 +144,10 @@
         property MaxColCount: Integer read FMaxColCount;
         // Does the parser own the stream ? If true, a previous stream is freed when set or when parser is destroyed.
         Property FreeStream : Boolean Read FFreeStream Write FFreeStream;
    +    // Return BOM found in file
    +    property BOM: TCSVByteOrderMark read FBOM;
    +    // Optionally skip BOM detection
    +    property SkipBOM: Boolean read FSkipBOM write FSkipBOM default false;
       end;
     
       // Sequential output to CSV stream
    @@ -443,9 +451,32 @@
     end;
     
     procedure TCSVParser.ResetParser;
    +var
    +  b: packed array[0..2] of byte;
    +  n: Integer;
     begin
       ClearOutput;
       FSourceStream.Seek(0, soFromBeginning);
    +  if FSkipBOM then
    +  begin
    +    FSourceStream.ReadBuffer(b[0], 3);
    +    if (b[0] = $EF) and (b[1] = $BB) and (b[2] = $BF) then begin
    +      FBOM := bomUTF8;
    +      n := 3;
    +    end else
    +    if (b[0] = $FE) and (b[1] = $FF) then begin
    +      FBOM := bomUTF16BE;
    +      n := 2;
    +    end else
    +    if (b[0] = $FF) and (b[1] = $FE) then begin
    +      FBOM := bomUTF16LE;
    +      n := 2;
    +    end else begin
    +      FBOM := bomNone;
    +      n := 0;
    +    end;
    +    FSourceStream.Seek(n, soFromBeginning);
    +  end;
       EndOfFile := False;
       NextChar;
     end;
    
    csvreadwrite.pp.v2.patch (1,854 bytes)
  • csvdocument-bom-v2.zip (1,453 bytes)

Activities

wp

2016-11-09 15:32

reporter  

csvdocument-bom.zip (1,446 bytes)

wp

2016-11-09 15:33

reporter  

csvreadwrite.pp.patch (1,652 bytes)
Index: fcl-base/src/csvreadwrite.pp
===================================================================
--- fcl-base/src/csvreadwrite.pp	(revision 34850)
+++ fcl-base/src/csvreadwrite.pp	(working copy)
@@ -92,6 +92,8 @@
 
   { TCSVParser }
 
+  TCSVByteOrderMark = (bomNone, bomUTF8, bomUTF16LE, bomUTF16BE);
+
   TCSVParser = class(TCSVHandler)
   private
     FFreeStream: Boolean;
@@ -98,6 +100,7 @@
     // fields
     FSourceStream: TStream;
     FStrStreamWrapper: TStringStream;
+    FBOM: TCSVByteOrderMark;
     // parser state
     EndOfFile: Boolean;
     EndOfLine: Boolean;
@@ -140,6 +143,8 @@
     property MaxColCount: Integer read FMaxColCount;
     // Does the parser own the stream ? If true, a previous stream is freed when set or when parser is destroyed.
     Property FreeStream : Boolean Read FFreeStream Write FFreeStream;
+    // Return BOM found in file
+    property BOM: TCSVByteOrderMark read FBOM;
   end;
 
   // Sequential output to CSV stream
@@ -443,9 +448,30 @@
 end;
 
 procedure TCSVParser.ResetParser;
+var
+  s: String;
+  n: Integer;
 begin
   ClearOutput;
   FSourceStream.Seek(0, soFromBeginning);
+  SetLengtH(s, 4);
+  FSourceStream.ReadBuffer(s[1], 4);
+  if (s[1] = #$EF) and (s[2] = #$BB) and (s[3] = #$BF) then begin
+    FBOM := bomUTF8;
+    n := 3;
+  end else
+  if (s[1] = #$FE) and (s[2] = #$FF) then begin
+    FBOM := bomUTF16BE;
+    n := 2;
+  end else
+  if (s[1] = #$FF) and (s[2] = #$FE) then begin
+    FBOM := bomUTF16LE;
+    n := 2;
+  end else begin
+    FBOM := bomNone;
+    n := 0;
+  end;
+  FSourceStream.Seek(n, soFromBeginning);
   EndOfFile := False;
   NextChar;
 end;
csvreadwrite.pp.patch (1,652 bytes)

Thaddy de Koning

2016-11-09 17:30

reporter   ~0095712

Last edited: 2016-11-09 17:35

View 2 revisions

The patch is not correct. It assumes AnsiString as string type.
The issue is correct.

wp

2016-11-09 17:55

reporter   ~0095714

Thanks, you are right.

The new patch (v2) uses an array of byte for reading the first few bytes of the stream.

I also added a new boolean property "SkipBOM" defaulting to false. This turns off the new BOM detection to prevent broken code for those users who had skipped the BOM manually in their application (after calling parser.SetSource). The new test project (v2) sets this property, but I commented this line to show the error first.

I would be tempted to default "SkipBOM" to true because this would be the practical use case. I don't know how to balance "ease of use for many" vs. "code-breakage for few". Michael, feel free to change the default or remove SkipBOM completely.

wp

2016-11-09 17:55

reporter  

csvreadwrite.pp.v2.patch (1,854 bytes)
Index: fcl-base/src/csvreadwrite.pp
===================================================================
--- fcl-base/src/csvreadwrite.pp	(revision 34850)
+++ fcl-base/src/csvreadwrite.pp	(working copy)
@@ -92,6 +92,8 @@
 
   { TCSVParser }
 
+  TCSVByteOrderMark = (bomNone, bomUTF8, bomUTF16LE, bomUTF16BE);
+
   TCSVParser = class(TCSVHandler)
   private
     FFreeStream: Boolean;
@@ -98,6 +100,8 @@
     // fields
     FSourceStream: TStream;
     FStrStreamWrapper: TStringStream;
+    FBOM: TCSVByteOrderMark;
+    FSkipBOM: Boolean;
     // parser state
     EndOfFile: Boolean;
     EndOfLine: Boolean;
@@ -140,6 +144,10 @@
     property MaxColCount: Integer read FMaxColCount;
     // Does the parser own the stream ? If true, a previous stream is freed when set or when parser is destroyed.
     Property FreeStream : Boolean Read FFreeStream Write FFreeStream;
+    // Return BOM found in file
+    property BOM: TCSVByteOrderMark read FBOM;
+    // Optionally skip BOM detection
+    property SkipBOM: Boolean read FSkipBOM write FSkipBOM default false;
   end;
 
   // Sequential output to CSV stream
@@ -443,9 +451,32 @@
 end;
 
 procedure TCSVParser.ResetParser;
+var
+  b: packed array[0..2] of byte;
+  n: Integer;
 begin
   ClearOutput;
   FSourceStream.Seek(0, soFromBeginning);
+  if FSkipBOM then
+  begin
+    FSourceStream.ReadBuffer(b[0], 3);
+    if (b[0] = $EF) and (b[1] = $BB) and (b[2] = $BF) then begin
+      FBOM := bomUTF8;
+      n := 3;
+    end else
+    if (b[0] = $FE) and (b[1] = $FF) then begin
+      FBOM := bomUTF16BE;
+      n := 2;
+    end else
+    if (b[0] = $FF) and (b[1] = $FE) then begin
+      FBOM := bomUTF16LE;
+      n := 2;
+    end else begin
+      FBOM := bomNone;
+      n := 0;
+    end;
+    FSourceStream.Seek(n, soFromBeginning);
+  end;
   EndOfFile := False;
   NextChar;
 end;
csvreadwrite.pp.v2.patch (1,854 bytes)

wp

2016-11-09 17:56

reporter  

csvdocument-bom-v2.zip (1,453 bytes)

Michael Van Canneyt

2016-11-11 11:05

administrator   ~0095755

Applied, but changed the name of SkipBOM to DetectBOM.
I find this more clear.

I left the default to False.
Reasoning: detecting the BOM does a Seek() and not all streams support this.

wp

2016-11-11 18:04

reporter   ~0095775

Thank you, perfect!

Issue History

Date Modified Username Field Change
2016-11-09 15:32 wp New Issue
2016-11-09 15:32 wp File Added: csvdocument-bom.zip
2016-11-09 15:33 wp File Added: csvreadwrite.pp.patch
2016-11-09 17:28 Michael Van Canneyt Assigned To => Michael Van Canneyt
2016-11-09 17:28 Michael Van Canneyt Status new => assigned
2016-11-09 17:30 Thaddy de Koning Note Added: 0095712
2016-11-09 17:35 Thaddy de Koning Note Edited: 0095712 View Revisions
2016-11-09 17:55 wp Note Added: 0095714
2016-11-09 17:55 wp File Added: csvreadwrite.pp.v2.patch
2016-11-09 17:56 wp File Added: csvdocument-bom-v2.zip
2016-11-11 11:05 Michael Van Canneyt Fixed in Revision => 34871
2016-11-11 11:05 Michael Van Canneyt Note Added: 0095755
2016-11-11 11:05 Michael Van Canneyt Status assigned => resolved
2016-11-11 11:05 Michael Van Canneyt Fixed in Version => 3.1.1
2016-11-11 11:05 Michael Van Canneyt Resolution open => fixed
2016-11-11 11:05 Michael Van Canneyt Target Version => 3.2.0
2016-11-11 18:04 wp Note Added: 0095775
2016-11-11 18:04 wp Status resolved => closed