Delphi-PRAXiS

Delphi-PRAXiS (https://www.delphipraxis.net/forum.php)
-   Netzwerke (https://www.delphipraxis.net/14-netzwerke/)
-   -   Delphi Einfacher HTML-Parser (zum Testen) (https://www.delphipraxis.net/50325-einfacher-html-parser-zum-testen.html)

c.wuensch 23. Jul 2005 21:41


Einfacher HTML-Parser (zum Testen)
 
Hi,

ich möchte hier einmal einen recht einfachen HTML-Parser veröffentlichen, den ich geschrieben habe (hoffe, das ist die richtige Kategorie).

Im Gegensatz zu einem normalen Parser zerlegt dieser nur ein einziges Element in seine Einzelteile, und lässt den restlichen Code unverändert. Dieses Element kann dann über diverse Methoden verändert werden, und anschließend erhält man die veränderte HTML-Seite über die Funktion SaveToText wieder zurück.

Vielleicht kann ja jemand etwas damit anfangen...

Delphi-Quellcode:
unit HtmlParser_u;

interface

uses
  SysUtils, Dialogs, StrUtils;

type
  tHtmlAttribut = Record
    Option, Value, QuoteChar: String; HasValue: Boolean;
  End;
  tHtmlAttributes = Array Of tHtmlAttribut;

type
  tHtmlElement = class

  private
    zTextBefore: String;
    zTagName: String;
    zAttributes: tHtmlAttributes;
    zElementText: String;
    zHasCloseTag: Boolean;
    zTextAfter: String;

    function GetTextBefore: String;
    function GetTagName: String;
    function GetAttributes: tHtmlAttributes;
    function GetElementText: String;
    function GetHasCloseTag: Boolean;
    function GetTextAfter: String;

    procedure SetTextBefore (pTextBefore: String);
    procedure SetTagName (pTagName: String);
    procedure SetAttributes (pAttributes: tHtmlAttributes);
    procedure SetElementText (pElementText: String);
    procedure SetHasCloseTag (pHasCloseTag: Boolean);
    procedure SetTextAfter (pTextAfter: String);

  public
    constructor Create (HtmlText, TagName: String; FindById, FindLast: Boolean; FindNr: Integer);
    property TextBefore: String read GetTextBefore write SetTextBefore;
    property TagName: String read GetTagName write SetTagName;
    property Attributes: tHtmlAttributes read GetAttributes write SetAttributes;
    property ElementText: String read GetElementText write SetElementText;
    property HasCloseTag: Boolean read GetHasCloseTag write SetHasCloseTag;
    property TextAfter: String read GetTextAfter write SetTextAfter;

    procedure LoadFromText (HtmlText, TagName: String; FindById, FindLast: Boolean; FindNr: Integer);
    function SaveToText: String;
    function GetAttribute (Option: String): String;
    procedure SetAttribute (Option, Value: String; HasValue: Boolean);
    procedure RemoveAttribute (Option: String);
    procedure InsertText (InsertMode, NewText: String);
    procedure RemoveElement;
  end;


implementation

constructor tHtmlElement.Create (HtmlText, TagName: String; FindById, FindLast: Boolean; FindNr: Integer);
begin
  LoadFromText (HtmlText, TagName, FindById, FindLast, FindNr);
end;

function tHtmlElement.GetTextBefore: String;
begin
  Result := zTextBefore;
end;

function tHtmlElement.GetTagName: String;
begin
  Result := zTagName;
end;

function tHtmlElement.GetAttributes: tHtmlAttributes;
begin
  Result := zAttributes;
end;

function tHtmlElement.GetElementText: String;
begin
  Result := zElementText;
end;

function tHtmlElement.GetHasCloseTag: Boolean;
begin
  Result := zHasCloseTag;
end;

function tHtmlElement.GetTextAfter: String;
begin
  Result := zTextAfter;
end;

procedure tHtmlElement.SetTextBefore (pTextBefore: String);
begin
  zTextBefore := pTextBefore;
end;

procedure tHtmlElement.SetTagName (pTagName: String);
begin
  zTagName := pTagName;
end;

procedure tHtmlElement.SetAttributes (pAttributes: tHtmlAttributes);
begin
  zAttributes := pAttributes;
end;

procedure tHtmlElement.SetElementText (pElementText: String);
begin
  zElementText := pElementText;
end;

procedure tHtmlElement.SetHasCloseTag (pHasCloseTag: Boolean);
begin
  zHasCloseTag := pHasCloseTag;
end;

procedure tHtmlElement.SetTextAfter (pTextAfter: String);
begin
  zTextAfter := pTextAfter;
end;

procedure tHtmlElement.LoadFromText (HtmlText, TagName: String; FindById, FindLast: Boolean; FindNr: Integer);
const Delimiters = [' ', #9, #13, #10];
var LowerText, LowerTag: String; TagLength, i, hPos, Pos1, Pos2: integer;

  procedure ParseDelimiter;
  begin
    While (Pos1<Length(HtmlText)) And (HtmlText[Pos1] In Delimiters) Do Inc(Pos1);
  end;

  function ParseWord: String;
  begin
    While (Pos1<Length(HtmlText)) And Not (HtmlText[Pos1] In Delimiters + ['<','>']) Do Begin
      Result := Result + HtmlText[Pos1];
      Inc(Pos1);
    End;
  end;

  function ParseOption: String;
  begin
    While (Pos1<Length(HtmlText)) And (HtmlText[Pos1] In ['A'..'Z', 'a'..'z', '-', '_', '/']) Do Begin
      Result := Result + HtmlText[Pos1];
      Inc(Pos1);
    End;
  end;

  procedure ParseAttribut;
  var hOption, hValue, hQuoteChar: String; hHasValue: Boolean; hPos, AttrNr: Integer;
  begin
    ParseDelimiter;
    hOption := ParseOption;
    If hOption <> '' Then Begin
      ParseDelimiter;
      If HtmlText[Pos1] = '=' Then Begin
        hHasValue := True;
        Inc(Pos1);
        ParseDelimiter;
        If HtmlText[Pos1] In ['"', #39] Then Begin
          hQuoteChar := HtmlText[Pos1];
          Inc(Pos1);
          hPos := PosEx (hQuoteChar, HtmlText, Pos1);
          hValue := Copy (HtmlText, Pos1, hPos-Pos1);
          Pos1 := hPos + 1;
        End
        Else hValue := ParseWord;
      End
      Else Begin
        hHasValue := False;
        If HtmlText[Pos1] <> '>' Then Dec(Pos1);
      End;

      AttrNr := Length(zAttributes);
      SetLength (zAttributes, AttrNr+1);
      With zAttributes[AttrNr] Do Begin
        Option := hOption;
        Value := hValue;
        QuoteChar := hQuoteChar;
        HasValue := hHasValue;
      End;
    End;
  end;

begin
  zTextBefore := '';
  zTagName := '';
  zElementText := '';
  zTextAfter := '';
  SetLength (zAttributes, 0);

  If (HtmlText = '') Or (TagName = '') Then Exit;
  TagLength := Length (TagName);
  LowerText := LowerCase (HtmlText);
  LowerTag := LowerCase (TagName);

  // Element und TextBefore ermitteln
  hPos := 0;
  Pos1 := 0;
  i := FindNr;

  If FindById Then Begin
    Repeat
      hPos := PosEx ('id', LowerText, hPos+1);
      Pos2 := hPos;
      If Pos2 <> 0 Then Begin
        Inc(Pos2, 2);
        While LowerText[Pos2] In Delimiters + ['"', #39, '='] Do Inc(Pos2);
        If Copy (LowerText, Pos2, TagLength) = LowerTag Then Begin
          While Not (LowerText[Pos2] In ['<', '>']) Do Dec (Pos2);
          If LowerText[Pos2] = '<' Then Begin
            Pos1 := Pos2;
            Dec(i);
          End
          Else Continue;
        End
        Else Continue;
      End
      Else
        If Not FindLast Then Pos1 := 0;
    Until ((Not FindLast) And (i<=0)) Or (Pos2 = 0);
  End
  Else Begin
    Repeat
      hPos := PosEx ('<'+LowerTag, LowerText, hPos+1);
      If (hPos <> 0) Or (Not FindLast) Then Pos1 := hPos;
      Dec(i);
    Until ((Not FindLast) And (i<=0)) Or (hPos = 0);
  End;

  If Pos1 <> 0 Then Begin
    zTextBefore := Copy (HtmlText, 1, Pos1-1);
    hPos := Pos1+1;
    While Not (HtmlText[hPos] In Delimiters) Do Begin
      zTagName := zTagName + HtmlText[hPos];
      Inc(hPos);
    End;
    LowerTag := LowerCase (zTagName);
//    zTagName := Copy (HtmlText, Pos1+1, Length(TagName));
  End
  Else Exit; // Element nicht gefunden

  // Attribute ermitteln
  Pos1 := Pos1 + Length(zTagName) + 1;
  SetLength (zAttributes, 0);
  While HtmlText[Pos1] In Delimiters Do ParseAttribut;
  If HtmlText[Pos1] = '/' Then ParseAttribut;

  // ElementText ermitteln
  Pos2 := Pos1;
  hPos := Pos1;
  Inc (Pos1);
  Repeat
    Pos2 := PosEx ('</'+LowerTag+'>', LowerText, Pos2+1);
    hPos := PosEx ('<'+LowerTag, LowerText, hPos+1);
  Until (hPos = 0) Or (Pos2 <= hPos);
  If Pos2 = 0 Then Begin
    zHasCloseTag := False;
    zElementText := '';
  End
  Else Begin
    zHasCloseTag := True;
    zElementText := Copy (HtmlText, Pos1, Pos2 - Pos1);
  End;

  // TextAfter ermitteln
  If Pos1 < Pos2 Then Pos1 := Pos2 + Length (zTagName) + 3;
  zTextAfter := Copy (HtmlText, Pos1, Length(HtmlText)-Pos1+1);
end;

function tHtmlElement.SaveToText: String;
var hCloseTag: String;

  function AttrToString: String;
  var i: integer;
  begin
    For i:=0 To Length(zAttributes)-1 Do Begin
      With zAttributes[i] Do Begin
        Result := Result + ' ' + Option;
        If HasValue Then Result := Result + '=' + QuoteChar + Value + QuoteChar;
      End;
    End;
  end;

begin
  If zTagName <> '' Then Begin
    If zHasCloseTag Then hCloseTag := '</'+zTagName+'>';
    Result := zTextBefore + '<'+zTagName+AttrToString+'>' + zElementText + hCloseTag + zTextAfter;
  End
  Else Result := '';
end;

function tHtmlElement.GetAttribute (Option: String): String;
var LowerOption: String; i: integer;
begin
  LowerOption := LowerCase (Option);
  For i:=0 To Length(zAttributes)-1 Do Begin
    If LowerCase(zAttributes[i].Option) = LowerOption Then Begin
      Result := zAttributes[i].Value;
      Exit;
    End;
  End;
end;

procedure tHtmlElement.SetAttribute (Option, Value: String; HasValue: Boolean);
var LowerOption: String; i, AttrNr: integer;
begin
  LowerOption := LowerCase (Option);
  For i:=0 To Length(zAttributes)-1 Do Begin
    If LowerCase(zAttributes[i].Option) = LowerOption Then Begin
      zAttributes[i].Value := Value;
      Exit;
    End;
  End;

  AttrNr := Length (zAttributes);
  SetLength (zAttributes, AttrNr + 1);
  zAttributes[AttrNr].Option := Option;
  zAttributes[AttrNr].Value := Value;
  zAttributes[AttrNr].QuoteChar := '"';
  zAttributes[AttrNr].HasValue := HasValue;
end;

procedure tHtmlElement.RemoveAttribute (Option: String);
var LowerOption: String; i: integer;

  procedure DeleteArrayElement (pIndex: Integer);
  var i: Integer;
  begin
    If pIndex < High(zAttributes) Then Begin
      For i:=pIndex To High(zAttributes)-1 Do zAttributes[i] := zAttributes[i+1];
      SetLength (zAttributes, Length(zAttributes)-1);
    End;
  end;

begin
  LowerOption := LowerCase (Option);
  For i:=0 To Length(zAttributes)-1 Do Begin
    If LowerCase(zAttributes[i].Option) = LowerOption Then Begin
      DeleteArrayElement (i);
      Exit;
    End;
  End;
end;

procedure tHtmlElement.InsertText (InsertMode, NewText: String);
var LowerMode: String;
begin
  LowerMode := LowerCase (InsertMode);
  If LowerMode = 'beforebegin' Then zTextBefore := zTextBefore + #13#10 + NewText
  Else If LowerMode = 'afterbegin' Then zElementText := NewText + #13#10 + zElementText
  Else If LowerMode = 'beforeend' Then zElementText := zElementText + #13#10 + NewText
  Else If LowerMode = 'afterend' Then zTextAfter := NewText + #13#10 + zTextAfter
  Else Showmessage ('Ungültiges Argument: ' + InsertMode);
end;

procedure tHtmlElement.RemoveElement;
begin
  zTagName := '';
  zElementText := '';
  SetLength (zAttributes, 0);
end;

end.
Da ich noch nicht allzu lange mit Delphi programmiere, ist der Code möglicherweise noch ziemlich rudimentär.

Ich würde mich freuen, wenn einige Experten hier den Code einmal ansehen, und evtl. Verbesserungsvorschläge (v.a. bezüglich Performance) geben könnten.

Cu, Chris

[Edit]
Folgende Funktionen hinzugefügt:
- zu parsendes Element kann über Nummer festgelegt werden
- Element kann jetzt auch nach ID gesucht werden


Alle Zeitangaben in WEZ +1. Es ist jetzt 17:22 Uhr.

Powered by vBulletin® Copyright ©2000 - 2025, Jelsoft Enterprises Ltd.
LinkBacks Enabled by vBSEO © 2011, Crawlability, Inc.
Delphi-PRAXiS (c) 2002 - 2023 by Daniel R. Wolf, 2024-2025 by Thomas Breitkreuz