Unicode search

**mijack**

I used this some times ago to find a keyword in some HTML Files
But when i modify it to search for a Unicode keyword ( Word ) it gives an empty result is there any wrong issue with it . ( i use TntSysUtils units )

zusammenfalten · markieren

Delphi-Quellcode:

			function ScanFile(const filename: String;

                 const forString: String;

                 caseSensitive: Boolean ): LongInt;

{ returns position of string in file or -1, if not found }

const

 BufferSize= $8001;  { 32K+1 bytes }

var

 pBuf, pEnd, pScan, pPos : PWidechar;

 filesize: LongInt;

 bytesRemaining: LongInt;

 bytesToRead: Integer;

 F   : File;

 SearchFor: PWidechar;

 oldMode: Word;

begin

 Result := -1;  { assume failure }

 if (Length( forString ) = 0) or (Length( filename ) = 0) then

   Exit;

 SearchFor := nil;

 pBuf      := nil;

 { open file as binary, 1 byte recordsize }

 AssignFile( F, filename );

 oldMode := FileMode;

 FileMode := 0;    { read-only access }

 Reset( F, 1 );

 FileMode := oldMode;

 try { allocate memory for buffer and pchar search string }

   SearchFor := StrAllocW( Length( forString )+1 );

   StrPCopyW( SearchFor, forString );

  if not caseSensitive then  { convert to upper case }

  Tnt_WideUpperCase(SearchFor ); // 

    // AnsiUpperCase( SearchFor );

   GetMem( pBuf, BufferSize );

   filesize := System.Filesize( F );

   bytesRemaining := filesize;

   pPos := nil;

   while bytesRemaining > 0 do

   begin

     { calc how many bytes to read this round }

     if bytesRemaining >= BufferSize then

       bytesToRead := Pred( BufferSize )

     else

       bytesToRead := bytesRemaining;

     { read a buffer full and zero-terminate the buffer }

     BlockRead(F, pBuf^, bytesToRead, bytesToRead);

     pEnd := @pBuf[ bytesToRead ];

     pEnd^:= #0;

     { scan the buffer. Problem: buffer may contain #0 chars! So we

       treat it as a concatenation of zero-terminated strings. }

     pScan := pBuf;

     while pScan < pEnd do

     begin

      if not caseSensitive then { convert to upper case }

        Tnt_WideUpperCase( pScan );

       pPos := StrPosW( pScan, SearchFor );  { search for substring }

       if pPos <> nil then

       begin { Found it! }

         Result := FileSize - bytesRemaining +

                   LongInt( pPos ) - LongInt( pBuf );

         Break;

       end;

       pScan := StrEndW( pScan );

       Inc( pScan );

     end;

     if pPos <> nil then

       Break;

     bytesRemaining := bytesRemaining - bytesToRead;

     if bytesRemaining > 0 then

     begin

     { no luck in this buffers load. We need to handle the case of

       the search string spanning two chunks of file now. We simply

       go back a bit in the file and read from there, thus inspecting

       some characters twice

     }

       Seek( F, FilePos(F)-Length( forString ));

       bytesRemaining := bytesRemaining + Length( forString );

     end;

   end; { While }

 finally

   CloseFile( F );

   If SearchFor <> nil then

     StrDisposeW( SearchFor );

   If pBuf <> nil then

     FreeMem( pBuf, BufferSize );

 end;

end; { ScanFile }

procedure GetFileList( FileList: TStringList; inDir, Extension : String );

 procedure ProcessSearchRec( aSearchRec : TSearchRecW );

 var

  sDate: String;

 begin

   if ( aSearchRec.Attr and faDirectory ) <> 0 then

   begin

     if ( aSearchRec.Name <> '.' ) and

        ( aSearchRec.Name <> '..' ) then

     begin

       GetFileList( FileList, Extension, InDir + '\' + aSearchRec.Name );

     end;

   end

   else

   begin

     sDate := DateTimeToStr(FileDateToDateTime(aSearchRec.Time));

     FileList.Add(inDir + '\' + aSearchRec.Name);

   end;

 end;

var CurDir : String;

 aSearchRec : TSearchRecW;

begin

 CurDir := inDir + '\*.' + Extension;

 if WideFindFirst( CurDir, faAnyFile, aSearchRec ) = 0 then

 begin

   ProcessSearchRec( aSearchRec );

   while WideFindNext( aSearchRec ) = 0 do

     ProcessSearchRec( aSearchRec );

 end;

 WideFindClose(aSearchRec);

end;

procedure TForm1.GetHTMLFileList(Directory, SearchString: WideString;

  CaseSens: Boolean);

var

 FL: TStringList;

begin

 FL := TStringList.Create;

 FL.Sorted := True;

 GetFileList(FL, Directory, 'HTM*');

 ProcessHTMLFIles(FL, SearchString, CaseSens);

 FL.Free;

end;

procedure TForm1.ProcessHTMLFiles(FileList: TStringList;

  SearchString: WideString; CaseSens: Boolean);

var

 i: Integer;

begin

 for i := 0 to Pred(FileList.Count) do

 begin

   if ScanFile(FileList.Strings[i], SearchString, CaseSens) > 0 then

   begin

     // The result was found 

     Memo1.Lines.Add(FileList.Strings[i]); // a memo is TntMemo 

   end;

 end;

end;

many thanks

**mijack**

please is there any help .

**mijack**

I have attached an exemple of the code i use , could someone tell me where is the error and why it doesn't give any result with Unicode Html File .

**himitsu**

Why do you use so many ansi-function in spite of searching for a wide-string?

AnsiUpperCase is a funcion and not a procedure.
forString := WideUpperCase( forString ); it have not been tested still, however, I think it functions nevertheless.

zusammenfalten · markieren

Delphi-Quellcode:

			function ScanFile(const filename: String;

                 forString: WideString;

                 caseSensitive: Boolean ): LongInt;

{ returns position of string in file or -1, if not found }

const

 BufferSize= $8000;  { 32K bytes }

var

 Buf: WideString;

 filesize, bytesRemaining, bytesToRead, bytesReaded, i: Integer;

 F: File of Widechar;

 oldMode: Word;

begin

 Result := -1;  { assume failure }

 if (Length( forString ) = 0) or (Length( filename ) = 0) then

   Exit;

 { open file as binary, 1 byte recordsize }

 AssignFile( F, filename );

 oldMode := FileMode;

 FileMode := 0;    { read-only access }

 Reset(F);//, 1 );

 FileMode := oldMode;

 try

   if not caseSensitive then  { convert to upper case }

     forString := WideUpperCase( forString );

   filesize := System.Filesize( F ) and not 1;

   bytesRemaining := filesize;

   Buf := '';

   while bytesRemaining > 0 do

   begin

     { calc how many bytes to read this round }

     if bytesRemaining > BufferSize then

       bytesToRead := BufferSize

     else

       bytesToRead := bytesRemaining;

     { delete the buffer, up to a part for the buffer overall search }

     Delete(Buf, 1, Length(Buf) - Length(forString) + 1);

     i := Length(Buf);

     { read a buffer }

     SetLength(Buf, i + bytesToRead);

     BlockRead(F, Buf[i + 1], bytesToRead, bytesReaded);

     if bytesToRead <> bytesReaded then

       Exit; { read error }

     if not caseSensitive then  { convert to upper case }

       Buf := WideUpperCase( Buf );

     { scan the buffer }

     i := Pos(forString, Buf);

     if i > 0 then

     begin

       Result := FileSize - bytesRemaining - Length(Buf) + i;

       Exit;

     end;

     Dec(bytesRemaining, bytesToRead);

   end; { While }

 finally

   CloseFile( F );

 end;

end; { ScanFile }

Unicode Html File

Are you secure with the format?
Not that the files are UTF-8 coded for example.

**mijack**

Thank you himitsu but that doesn't give any result , could you please correct the exemple

Note : StrPCopyW and StrAllocW are from Unicode20 of Mike , i will attache it here

**Sir Rufo**

Zitat von himitsu:

BlockRead(F, Buf[i + 1], bytesToRead, bytesReaded);

Maybe thats the problem, because Unicode (UTF-8) has variable length per char from 1 up to 4 Bytes.
Didn't test it - so maybe ...

Zitat von himitsu:

Unicode Html File

Are you secure with the format?
Not that the files are UTF-8 coded for example.

But UTF-8 is Unicode ->

Wikipedia UTF-8
Unicode Transformation Format

greetings

Oliver

**himitsu**

UTF-8 is a transcodet format that constains Unicode-Data.
Unicode has 2 bytes per char.

I already found a mistake with myself:

markieren

Code:

			[color=#0000ff]{ read a buffer }[/color]

SetLength(Buf, i + [color=#ff0000](bytesToRead div 2)[/color]);

but:

Zitat:

{ open file as binary, 1 byte recordsize }

However, there the file not binary, but in WideChar steps is processed, the statement was not right.

[add]
I re-worked the function/demo and still a UTF8 version with inserted.

Unicode search

Unicode search

Re: Unicode search

Re: Unicode search

Re: Unicode search

Re: Unicode search

Re: Unicode search

Re: Unicode search

Forumregeln

mijack Registriert seit: 18. Mai 2007 11 Beiträge Delphi 2007 Enterprise	#2 Re: Unicode search 24. Mai 2007, 18:29 please is there any help .
	Zitat