![]() |
Unicode search
I used this some times ago to find a keyword in some HTML Files
But when i modify it to search for a Unicode keyword ( Word ) it gives an empty result is there any wrong issue with it . ( i use TntSysUtils units )
Delphi-Quellcode:
many thanks
function ScanFile(const filename: String;
const forString: String; caseSensitive: Boolean ): LongInt; { returns position of string in file or -1, if not found } const BufferSize= $8001; { 32K+1 bytes } var pBuf, pEnd, pScan, pPos : PWidechar; filesize: LongInt; bytesRemaining: LongInt; bytesToRead: Integer; F : File; SearchFor: PWidechar; oldMode: Word; begin Result := -1; { assume failure } if (Length( forString ) = 0) or (Length( filename ) = 0) then Exit; SearchFor := nil; pBuf := nil; { open file as binary, 1 byte recordsize } AssignFile( F, filename ); oldMode := FileMode; FileMode := 0; { read-only access } Reset( F, 1 ); FileMode := oldMode; try { allocate memory for buffer and pchar search string } SearchFor := StrAllocW( Length( forString )+1 ); StrPCopyW( SearchFor, forString ); if not caseSensitive then { convert to upper case } Tnt_WideUpperCase(SearchFor ); // // AnsiUpperCase( SearchFor ); GetMem( pBuf, BufferSize ); filesize := System.Filesize( F ); bytesRemaining := filesize; pPos := nil; while bytesRemaining > 0 do begin { calc how many bytes to read this round } if bytesRemaining >= BufferSize then bytesToRead := Pred( BufferSize ) else bytesToRead := bytesRemaining; { read a buffer full and zero-terminate the buffer } BlockRead(F, pBuf^, bytesToRead, bytesToRead); pEnd := @pBuf[ bytesToRead ]; pEnd^:= #0; { scan the buffer. Problem: buffer may contain #0 chars! So we treat it as a concatenation of zero-terminated strings. } pScan := pBuf; while pScan < pEnd do begin if not caseSensitive then { convert to upper case } Tnt_WideUpperCase( pScan ); pPos := StrPosW( pScan, SearchFor ); { search for substring } if pPos <> nil then begin { Found it! } Result := FileSize - bytesRemaining + LongInt( pPos ) - LongInt( pBuf ); Break; end; pScan := StrEndW( pScan ); Inc( pScan ); end; if pPos <> nil then Break; bytesRemaining := bytesRemaining - bytesToRead; if bytesRemaining > 0 then begin { no luck in this buffers load. We need to handle the case of the search string spanning two chunks of file now. We simply go back a bit in the file and read from there, thus inspecting some characters twice } Seek( F, FilePos(F)-Length( forString )); bytesRemaining := bytesRemaining + Length( forString ); end; end; { While } finally CloseFile( F ); If SearchFor <> nil then StrDisposeW( SearchFor ); If pBuf <> nil then FreeMem( pBuf, BufferSize ); end; end; { ScanFile } procedure GetFileList( FileList: TStringList; inDir, Extension : String ); procedure ProcessSearchRec( aSearchRec : TSearchRecW ); var sDate: String; begin if ( aSearchRec.Attr and faDirectory ) <> 0 then begin if ( aSearchRec.Name <> '.' ) and ( aSearchRec.Name <> '..' ) then begin GetFileList( FileList, Extension, InDir + '\' + aSearchRec.Name ); end; end else begin sDate := DateTimeToStr(FileDateToDateTime(aSearchRec.Time)); FileList.Add(inDir + '\' + aSearchRec.Name); end; end; var CurDir : String; aSearchRec : TSearchRecW; begin CurDir := inDir + '\*.' + Extension; if WideFindFirst( CurDir, faAnyFile, aSearchRec ) = 0 then begin ProcessSearchRec( aSearchRec ); while WideFindNext( aSearchRec ) = 0 do ProcessSearchRec( aSearchRec ); end; WideFindClose(aSearchRec); end; procedure TForm1.GetHTMLFileList(Directory, SearchString: WideString; CaseSens: Boolean); var FL: TStringList; begin FL := TStringList.Create; FL.Sorted := True; GetFileList(FL, Directory, 'HTM*'); ProcessHTMLFIles(FL, SearchString, CaseSens); FL.Free; end; procedure TForm1.ProcessHTMLFiles(FileList: TStringList; SearchString: WideString; CaseSens: Boolean); var i: Integer; begin for i := 0 to Pred(FileList.Count) do begin if ScanFile(FileList.Strings[i], SearchString, CaseSens) > 0 then begin // The result was found Memo1.Lines.Add(FileList.Strings[i]); // a memo is TntMemo end; end; end; |
Re: Unicode search
please is there any help .
|
Re: Unicode search
Liste der Anhänge anzeigen (Anzahl: 1)
I have attached an exemple of the code i use , could someone tell me where is the error and why it doesn't give any result with Unicode Html File .
|
Re: Unicode search
Why do you use so many ansi-function in spite of searching for a wide-string?
AnsiUpperCase is a funcion and not a procedure.
Delphi-Quellcode:
it have not been tested still, however, I think it functions nevertheless.
forString := WideUpperCase( forString );
Delphi-Quellcode:
function ScanFile(const filename: String;
forString: WideString; caseSensitive: Boolean ): LongInt; { returns position of string in file or -1, if not found } const BufferSize= $8000; { 32K bytes } var Buf: WideString; filesize, bytesRemaining, bytesToRead, bytesReaded, i: Integer; F: File of Widechar; oldMode: Word; begin Result := -1; { assume failure } if (Length( forString ) = 0) or (Length( filename ) = 0) then Exit; { open file as binary, 1 byte recordsize } AssignFile( F, filename ); oldMode := FileMode; FileMode := 0; { read-only access } Reset(F);//, 1 ); FileMode := oldMode; try if not caseSensitive then { convert to upper case } forString := WideUpperCase( forString ); filesize := System.Filesize( F ) and not 1; bytesRemaining := filesize; Buf := ''; while bytesRemaining > 0 do begin { calc how many bytes to read this round } if bytesRemaining > BufferSize then bytesToRead := BufferSize else bytesToRead := bytesRemaining; { delete the buffer, up to a part for the buffer overall search } Delete(Buf, 1, Length(Buf) - Length(forString) + 1); i := Length(Buf); { read a buffer } SetLength(Buf, i + bytesToRead); BlockRead(F, Buf[i + 1], bytesToRead, bytesReaded); if bytesToRead <> bytesReaded then Exit; { read error } if not caseSensitive then { convert to upper case } Buf := WideUpperCase( Buf ); { scan the buffer } i := Pos(forString, Buf); if i > 0 then begin Result := FileSize - bytesRemaining - Length(Buf) + i; Exit; end; Dec(bytesRemaining, bytesToRead); end; { While } finally CloseFile( F ); end; end; { ScanFile } Unicode Html File :gruebel: Are you secure with the format? Not that the files are UTF-8 coded for example. |
Re: Unicode search
Liste der Anhänge anzeigen (Anzahl: 1)
Thank you himitsu but that doesn't give any result , could you please correct the exemple
Note : StrPCopyW and StrAllocW are from Unicode20 of Mike , i will attache it here |
Re: Unicode search
Zitat:
Didn't test it - so maybe ... Zitat:
![]() Unicode Transformation Format :mrgreen: greetings Oliver |
Re: Unicode search
Liste der Anhänge anzeigen (Anzahl: 1)
UTF-8 is a transcodet format that constains Unicode-Data.
Unicode has 2 bytes per char. I already found a mistake with myself:
Code:
but:
[color=#0000ff]{ read a buffer }[/color]
SetLength(Buf, i + [color=#ff0000](bytesToRead div 2)[/color]); Zitat:
[add] I re-worked the function/demo and still a UTF8 version with inserted. :angel: |
Alle Zeitangaben in WEZ +1. Es ist jetzt 06:43 Uhr. |
Powered by vBulletin® Copyright ©2000 - 2025, Jelsoft Enterprises Ltd.
LinkBacks Enabled by vBSEO © 2011, Crawlability, Inc.
Delphi-PRAXiS (c) 2002 - 2023 by Daniel R. Wolf, 2024-2025 by Thomas Breitkreuz