F0-F7 + 80-BF + 80-BF + 80-BF // 4 Bytes = 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
此类文件里可能有ASCII字符,也可能有GB2312/GBK/GB18030/Big5的中文字符。单纯从文件内容上来看,它与没有BOM头的UTF-8编码文件类似,所以必须考虑如何与UTF-8文件区分开来。
type
  { Result codes produced by the GuessCharEncoding overloads.
    The declaration order (and therefore each ordinal value) is unchanged. }
  TCharEncoding = (
    ceAnsi,     // default result when no other encoding is indicated
    ceBinary,   // NUL/control bytes dominate the sample
    ceUtf_8,    // UTF-8 multi-byte sequences detected
    ceUcs2_LE,  // 16-bit Unicode, little endian
    ceUcs2_BE,  // 16-bit Unicode, big endian
    ceUtf_32,   // not returned by the detection code visible in this file section
    ceGB,       // GB2312 / GBK / GB18030
    ceBig5      // Big5
  );
{ Guesses the encoding of a raw byte buffer by scanning it once and scoring
  byte patterns.

  buf        Bytes to inspect. The code compares buf[i] against single-byte
             ranges such as #$80..#$FE, so a one-byte-per-character string
             type is assumed -- TODO confirm for Unicode-string compilers.
  SeeGBBig5  When True, and the GB and Big5 scores are within 5% of each
             other, and the gbkNulls/big5Nulls tie-break counters are equal,
             the decision is delegated to TryToDistinguishGBOrBig5.

  Returns
    ceAnsi     when nothing else is indicated, including for an empty buffer;
    ceBinary, ceUcs2_LE or ceUcs2_BE
               when more than one NUL/control byte was seen (or a single one
               in a buffer of at most 8 bytes);
    ceGB or ceBig5
               when the double-byte scores are at least as high as the UTF-8
               score (or double-byte candidates are at least twice as
               frequent as UTF-8 candidates);
    ceUtf_8    otherwise, when any UTF-8 evidence was found.

  Relies on short-circuit Boolean evaluation: every buf[...] access is
  preceded by a bounds test in the same expression. }
function GuessCharEncoding(const buf: string; SeeGBBig5: Boolean): TCharEncoding;
var
  len: Longint;

  { True when buf[Index..Index+2] is a lead byte E0..EF followed by two
    continuation bytes 80..BF, i.e. the shape of a 3-byte UTF-8 sequence. }
  function Maybe3BytesUtf8(Index: Integer): Boolean;
  begin
    Result := (Index + 2 <= len) and (buf[Index] in [#$E0..#$EF]) and
      (buf[Index + 1] in [#$80..#$BF]) and (buf[Index + 2] in [#$80..#$BF]);
  end;

var
  idx: Longint;
  iUtf8: Longint;
  maybeGB: Integer;   // score for GB2312/GBK/GB18030
  mayBig5: Integer;   // score for Big5
  mayUtf8: Integer;   // score for UTF-8
  maybeLE: Integer;   // NUL bytes at even 1-based offsets (UCS-2 little endian)
  maybeBE: Integer;   // NUL bytes at odd 1-based offsets (UCS-2 big endian)
  mayBins: Integer;   // NUL and control bytes (binary-file evidence)
  serZero: Integer;   // NUL bytes immediately followed by another NUL
  ratio: Integer;     // signed GB-vs-Big5 score difference, in percent
  chsCount: Integer;  // double-byte character candidates
  utf8Count: Integer; // UTF-8 sequence candidates
  gbkNulls: Integer;  // byte pairs in the ranges tested below; more of these favours Big5
  big5Nulls: Integer; // byte pairs in the ranges tested below; more of these favours GB
begin
  Result := ceAnsi;
  len := Length(buf);

  // An empty buffer carries no evidence. Without this guard the
  // "mayBins * 8 >= len" test further down is 0 >= 0, which is True, and the
  // function would report ceUcs2_LE for empty input.
  if len = 0 then
    Exit;

  maybeGB := 0;
  mayBig5 := 0;
  mayUtf8 := 0;
  maybeLE := 0;
  maybeBE := 0;
  mayBins := 0;
  serZero := 0;
  chsCount := 0;
  utf8Count := 0;
  gbkNulls := 0;
  big5Nulls := 0;

  idx := 1;
  while idx <= len do begin
    if idx < len then begin
      // Tie-break counters, evaluated on the pair (buf[idx], buf[idx+1]).
      // NOTE(review): the ranges look like code points that are unassigned or
      // user-defined in the respective code page -- confirm against the
      // GBK / Big5 tables.
      if (buf[idx] in [#$A1..#$A7]) and (buf[idx + 1] in [#$40..#$A0]) or
         (buf[idx] in [#$AA..#$AF, #$F8..#$FE]) and (buf[idx + 1] in [#$A1..#$FE]) then begin
        Inc(gbkNulls);
      end;
      // The first alternative looks only at the second byte of the pair.
      if (buf[idx + 1] in [#$7F..#$A0]) or
         (buf[idx] in [#$C7, #$C8]) and (buf[idx + 1] in [#$40..#$FE]) or
         (buf[idx] = #$C6) and (buf[idx + 1] in [#$A1..#$FE]) then begin
        Inc(big5Nulls);
      end;
    end;

    case buf[idx] of
      #0: begin
        Inc(mayBins);
        if (idx < len) and (buf[idx + 1] = #0) then begin
          Inc(serZero);
        end;
        // idx is 1-based: a NUL in the second byte of a 16-bit unit points to
        // little endian, a NUL in the first byte to big endian.
        if (idx mod 2) = 0 then begin
          Inc(maybeLE);
        end else begin
          Inc(maybeBE);
        end;
      end;

      // Control characters other than TAB (#9), LF (#10) and CR (#13).
      #1..#8, #11, #12, #14..#31: begin
        Inc(mayBins);
      end;

      #$80: begin
        // Look past this byte (and one further continuation byte, if any)
        // for a 3-byte UTF-8 sequence. idx itself is not advanced here.
        iUtf8 := idx;
        Inc(iUtf8);
        if (iUtf8 < len) and (buf[iUtf8] in [#$80..#$BF]) then Inc(iUtf8);
        if Maybe3BytesUtf8(iUtf8) then begin
          Inc(mayUtf8, 32);
          Inc(utf8Count);
        end;
      end;

      #$81..#$BF: begin
        // Lead bytes 81..A0 score for GB only; A1..BF score for GB and Big5.
        Inc(maybeGB, 8);
        if buf[idx] in [#$A1..#$BF] then begin
          Inc(mayBig5, 8);
        end;
        Inc(chsCount);
        Inc(idx); // consume the trail byte of the double-byte candidate
        iUtf8 := idx;
        if (iUtf8 < len) and (buf[iUtf8] in [#$80..#$BF]) then Inc(iUtf8);
        if Maybe3BytesUtf8(iUtf8) then begin
          Inc(mayUtf8, 32);
          Inc(utf8Count);
        end;
      end;

      #$C0..#$DF: begin
        if (idx < len) and (buf[idx + 1] in [#$80..#$BF]) then begin
          // Shape of a 2-byte UTF-8 sequence; it is also a possible
          // double-byte character, so all candidates are scored.
          Inc(mayUtf8);
          Inc(utf8Count);
          Inc(chsCount);
          if (buf[idx + 1] in [#$A1..#$BF]) then begin
            Inc(maybeGB);
            Inc(mayBig5);
          end else begin
            Inc(maybeGB, 4);
          end;
        end else begin
          Inc(maybeGB);
          Inc(mayBig5);
          Inc(chsCount);
        end;
        Inc(idx);
      end;

      #$E0..#$EF: begin
        if (idx + 2 <= len) and (buf[idx + 1] in [#$80..#$BF]) and (buf[idx + 2] in [#$80..#$BF]) then begin
          Inc(mayUtf8, 32);
          Inc(utf8Count);
        end;
        Inc(maybeGB);
        Inc(mayBig5);
        Inc(chsCount);
        Inc(idx);
      end;

      #$F0..#$FE: begin
        // Lead bytes FA..FE get an extra 8 points for GB on top of the
        // 8 points awarded to both GB and Big5.
        if buf[idx] in [#$FA..#$FE] then begin
          Inc(maybeGB, 8);
        end;
        Inc(maybeGB, 8);
        Inc(mayBig5, 8);
        Inc(chsCount);
        Inc(idx);
      end;
    end;
    Inc(idx);
  end;

  // Decide the encoding from the collected scores.
  if (mayBins > 1) or (maybeLE > 1) or (maybeBE > 1) or (mayBins * 8 >= len) or (maybeLE * 8 >= len) or (maybeBE * 8 >= len) then begin
    if (mayBins > maybeLE * 2) and (mayBins > maybeBE * 2) or (serZero > 10) or
       (serZero * 8 > Max(maybeLE, maybeBE)) then begin
      Result := ceBinary;
    end else if maybeLE >= maybeBE then begin
      Result := ceUcs2_LE;
    end else begin
      Result := ceUcs2_BE;
    end;
  end else if (maybeGB >= mayUtf8) or (mayBig5 >= mayUtf8) or (chsCount >= utf8Count * 2) then begin
    ratio := (maybeGB - mayBig5) * 100 div Max(1, Max(maybeGB, mayBig5));
    if Abs(ratio) <= 5 then begin
      // Scores are too close to call: fall back to the tie-break counters.
      // If those are equal as well and the helper is not consulted, Result
      // stays ceAnsi.
      if gbkNulls > big5Nulls then begin
        Result := ceBig5;
      end else if gbkNulls < big5Nulls then begin
        Result := ceGB;
      end else if SeeGBBig5 and ((maybeGB > 0) or (mayBig5 > 0)) then begin
        Result := TryToDistinguishGBOrBig5(Copy(buf, 1, len));
      end;
    end else begin
      if ratio > 0 then begin
        Result := ceGB;
      end else begin
        Result := ceBig5;
      end;
    end;
  end else if mayUtf8 > 0 then begin
    Result := ceUtf_8;
  end;
end;
{ Guesses the encoding of the data that follows the stream's current position.

  At most SamplingSize bytes (or fewer, if less data remains) are read into a
  temporary buffer, which is then passed to the string overload of
  GuessCharEncoding. The stream position is not restored afterwards.

  AStream    Source stream, read from its current position.
  SeeGBBig5  Forwarded unchanged to the string overload.

  Returns whatever the string overload reports for the sampled bytes. }
function GuessCharEncoding(AStream: TStream; SeeGBBig5: Boolean): TCharEncoding;
var
  buf: string;
  bytesRead: Longint;
begin
  SetLength(buf, Min(SamplingSize, AStream.Size - AStream.Position));
  // buf[1] must not be referenced on an empty string, so the read is skipped
  // when no bytes are left in the stream.
  if Length(buf) > 0 then begin
    bytesRead := AStream.Read(buf[1], Length(buf));
    if bytesRead < 0 then
      bytesRead := 0;
    // Read may deliver fewer bytes than requested; drop the unread tail so
    // it does not get scored as part of the sample.
    if bytesRead < Length(buf) then
      SetLength(buf, bytesRead);
  end;
  Result := GuessCharEncoding(buf, SeeGBBig5);
end;