Es gibt noch das hier, aber wie man das anwendet, ist mir ein Rätsel:
http://chsdet.sourceforge.net/
Sonst habe ich noch das hier gefunden, bin aber gerade unfähig, eine
ANSI-Datei zu erstellen. Daher bekomme ich bei jeder Datei true zurück.
Delphi-Quellcode:
{ Heuristically decides whether the file FileName may be UTF-8 encoded.

  The file is scanned as a sequence of adjacent byte pairs (previous, current):
    - a continuation byte (10xxxxxx) preceded by a byte with both top bits set
      (11xxxxxx) counts as evidence FOR UTF-8;
    - a continuation byte preceded by an ASCII byte (0xxxxxxx) counts as
      evidence AGAINST UTF-8;
    - a byte with both top bits set that is NOT followed by a continuation
      byte (including one at the very end of the file) counts as evidence
      AGAINST UTF-8.

  Returns True when the evidence for UTF-8 is at least as strong as the
  evidence against it, so a pure ASCII file (no evidence either way) is
  reported as UTF-8. Returns False when the file does not exist.

  This is a plausibility check only; it does not validate sequence lengths. }
function FileMayBeUTF8(FileName: WideString): Boolean;
var
  Stream: TMemoryStream;
  BytesRead: Integer;
  ArrayBuff: array [0 .. 127] of Byte;
  PreviousByte, CurrentByte: Byte;
  HavePrevious: Boolean;
  i: Integer;
  YesSequences, NoSequences: Integer;
begin
  // Define the result up front so the early Exit below does not return garbage.
  Result := False;
  if not FileExists(FileName) then
    Exit;

  YesSequences := 0;
  NoSequences := 0;
  PreviousByte := 0;
  HavePrevious := False;

  Stream := TMemoryStream.Create;
  try
    Stream.LoadFromFile(FileName);
    repeat
      // Fetch the next chunk; a short read marks the end of the data.
      BytesRead := Stream.Read(ArrayBuff, SizeOf(ArrayBuff));
      for i := 0 to BytesRead - 1 do
      begin
        CurrentByte := ArrayBuff[i];
        // PreviousByte is carried across chunks so that a pair straddling
        // a chunk boundary is examined like any other pair.
        if HavePrevious then
        begin
          if (CurrentByte and $C0) = $80 then
          begin
            // Current byte is a continuation byte.
            if (PreviousByte and $C0) = $C0 then
              Inc(YesSequences)
            else if (PreviousByte and $80) = $00 then
              Inc(NoSequences);
            // A continuation after another continuation is neutral: it is
            // the tail of a 3- or 4-byte sequence.
          end
          else if (PreviousByte and $C0) = $C0 then
            // A lead-style byte must be followed by a continuation byte;
            // this is the typical pattern of single-byte ANSI text.
            Inc(NoSequences);
        end;
        PreviousByte := CurrentByte;
        HavePrevious := True;
      end;
    until BytesRead < SizeOf(ArrayBuff);

    // A lead-style byte as the very last byte has no continuation either.
    if HavePrevious and ((PreviousByte and $C0) = $C0) then
      Inc(NoSequences);

    // ">=" treats files without any evidence (plain ASCII) as UTF-8;
    // a strict ">" would accept only files containing multi-byte sequences.
    Result := (YesSequences >= NoSequences);
  finally
    Stream.Free;
  end;
end;
Hier noch eine schöne Version. Sie gibt aber leider bei
ANSI auch true zurück.
Delphi-Quellcode:
{ Returns the total byte length (1..4) of a UTF-8 sequence, judged solely
  from the bit pattern of its first byte c, or -1 when c cannot start a
  sequence (continuation bytes 10xxxxxx and bytes 11111xxx).

  Only the prefix bits are inspected, so every byte in $C0..$DF maps to 2
  and every byte in $F0..$F7 maps to 4. }
function UTF8CharLength(const c: Byte): Integer;
begin
  case c of
    $00..$7F: Result := 1;  // 0xxxxxxx
    $C0..$DF: Result := 2;  // 110yyyyy
    $E0..$EF: Result := 3;  // 1110zzzz
    $F0..$F7: Result := 4;  // 11110uuu
  else
    Result := -1;           // 10xxxxxx or 11111xxx: not a valid first byte
  end;
end;
{ Returns True when c has the form 10xxxxxx, i.e. lies in $80..$BF, which is
  the shape of a UTF-8 trail (continuation) byte. }
function UTF8IsTrailChar(const c: Byte): Boolean;
begin
  Result := (c >= $80) and (c <= $BF);
end;
{ Checks whether the ASize bytes starting at AMem are structurally valid
  UTF-8: every sequence must begin with a byte that UTF8CharLength accepts,
  must fit completely inside the buffer, and must be followed by the
  matching number of trail bytes as judged by UTF8IsTrailChar.

  Returns True for a buffer that passes all checks, and also when ASize is
  zero or negative (the loop never runs). Returns False at the first
  violation. }
function IsUTF8Memory(AMem: PBYTE; ASize: Int64): Boolean;
var
  Offset: Int64;
  SeqLen: Integer;
  TrailIdx: Integer;
begin
  // Assume failure; only a complete pass over the buffer yields True.
  Result := False;
  Offset := 0;
  while Offset < ASize do
  begin
    // Length of the sequence introduced by the current lead byte.
    SeqLen := UTF8CharLength(AMem^);

    // Reject an invalid lead byte or a sequence running past the buffer end.
    if (SeqLen < 1) or (SeqLen > 4) or (Offset + SeqLen > ASize) then
      Exit;

    // Step over the lead byte, then verify each expected trail byte.
    Inc(AMem);
    for TrailIdx := 2 to SeqLen do
    begin
      if not UTF8IsTrailChar(AMem^) then
        Exit;
      Inc(AMem);
    end;

    Inc(Offset, SeqLen);
  end;
  Result := True;
end;