Thema: Delphi HTML2Text - noch buggy?

Einzelnen Beitrag anzeigen

bigg
(Gast)

n/a Beiträge
 

HTML2Text - noch buggy?

  Alt 13. Aug 2005, 00:05
moin,

ich habe leider keine bessere Funktion dazu gefunden,
es wäre schön, wenn jemand die Routine näher testen könnte.

Demo liegt bei:

Delphi-Quellcode:
// Translates the NAME of an HTML character entity (the text between '&' and
// ';', e.g. 'amp' for '&amp;') into the single character it stands for.
//
// Parameters:
//   s - entity name without the leading '&' and the trailing ';'.
//       The comparison is case-sensitive, as HTML entity names are
//       ('Auml' and 'auml' are different entities).
//
// Returns:
//   The decoded character, or #0 when s is empty, is not 2..6 characters
//   long, or is not one of the supported names.
//
// Supported names: quot, amp, lt, gt and the ISO 8859-1 (Latin-1) entity set
// nbsp .. yuml (character codes 160..255).
//
// Notes:
//   * 'nbsp' is deliberately returned as an ordinary space (#32) rather than
//     as the non-breaking space #160.
//   * Characters are produced from their numeric codes instead of non-ASCII
//     literals, so the result does not depend on the encoding this source
//     file is saved in.
//   * Numeric references such as '#228' are not handled and yield #0.
function HTML2Char(const s: String): Char;
const
  FirstCode = 160; // code of 'nbsp', the first Latin-1 entity
  LastCode  = 255; // code of 'yuml', the last Latin-1 entity

  // Entity names in code order: EntityNames[c] is the name of character c.
  EntityNames: array[FirstCode..LastCode] of String = (
    { 160 } 'nbsp',   'iexcl',  'cent',   'pound',  'curren', 'yen',    'brvbar', 'sect',
    { 168 } 'uml',    'copy',   'ordf',   'laquo',  'not',    'shy',    'reg',    'macr',
    { 176 } 'deg',    'plusmn', 'sup2',   'sup3',   'acute',  'micro',  'para',   'middot',
    { 184 } 'cedil',  'sup1',   'ordm',   'raquo',  'frac14', 'frac12', 'frac34', 'iquest',
    { 192 } 'Agrave', 'Aacute', 'Acirc',  'Atilde', 'Auml',   'Aring',  'AElig',  'Ccedil',
    { 200 } 'Egrave', 'Eacute', 'Ecirc',  'Euml',   'Igrave', 'Iacute', 'Icirc',  'Iuml',
    { 208 } 'ETH',    'Ntilde', 'Ograve', 'Oacute', 'Ocirc',  'Otilde', 'Ouml',   'times',
    { 216 } 'Oslash', 'Ugrave', 'Uacute', 'Ucirc',  'Uuml',   'Yacute', 'THORN',  'szlig',
    { 224 } 'agrave', 'aacute', 'acirc',  'atilde', 'auml',   'aring',  'aelig',  'ccedil',
    { 232 } 'egrave', 'eacute', 'ecirc',  'euml',   'igrave', 'iacute', 'icirc',  'iuml',
    { 240 } 'eth',    'ntilde', 'ograve', 'oacute', 'ocirc',  'otilde', 'ouml',   'divide',
    { 248 } 'oslash', 'ugrave', 'uacute', 'ucirc',  'uuml',   'yacute', 'thorn',  'yuml'
  );
var
  Len, Code: Integer;
begin
  Result := #0;

  // Every supported name is 2..6 characters long; reject anything else early.
  Len := Length(s);
  if (Len < 2) or (Len > 6) then
    Exit;

  // The four markup-significant characters are outside the 160..255 table.
  if s = 'quot' then
    Result := '"'
  else if s = 'amp' then
    Result := '&'
  else if s = 'lt' then
    Result := '<'
  else if s = 'gt' then
    Result := '>'
  else
    for Code := FirstCode to LastCode do
      if s = EntityNames[Code] then
      begin
        if Code = FirstCode then
          Result := #32 // nbsp: plain space on purpose
        else
          Result := Char(Code);
        Break;
      end;
end;




// Helper for HTML2Text: returns HTML with every tag removed.
// A tag is everything from a '<' up to and including the next '>'.
// A '<' that is never closed swallows the rest of the input; a '>' outside
// of a tag is ordinary text and is kept.
function StripHtmlTags(const HTML: String): String;
var
  Src, Dst, Len: Integer;
  InTag: Boolean;
begin
  Len := Length(HTML);
  SetLength(Result, Len); // upper bound; trimmed to the real length below
  Dst := 0;
  InTag := False;

  for Src := 1 to Len do
  begin
    if InTag then
    begin
      if HTML[Src] = '>' then
        InTag := False;
    end
    else if HTML[Src] = '<' then
      InTag := True
    else
    begin
      Inc(Dst);
      Result[Dst] := HTML[Src];
    end;
  end;

  SetLength(Result, Dst);
end;

// Helper for HTML2Text: replaces every '&name;' whose name HTML2Char knows
// with the decoded character. Anything else is copied through unchanged,
// including a lone '&' and unknown references together with their ';'.
// The search for the closing ';' stops at #0, a line break or another '&',
// so a stray ampersand cannot pair up with a far-away semicolon.
function DecodeHtmlEntities(const s: String): String;
var
  Src, Stop, Dst, Len: Integer;
  Decoded: Char;
begin
  Len := Length(s);
  SetLength(Result, Len); // decoding never makes the text longer
  Dst := 0;
  Src := 1;

  while Src <= Len do
  begin
    if s[Src] = '&' then
    begin
      // Look for the ';' that terminates this reference.
      Stop := Src + 1;
      while (Stop <= Len) and (s[Stop] <> ';') and (s[Stop] <> '&') and
            (s[Stop] <> #0) and (s[Stop] <> #10) and (s[Stop] <> #13) do
        Inc(Stop);

      if (Stop <= Len) and (s[Stop] = ';') then
      begin
        Decoded := HTML2Char(Copy(s, Src + 1, Stop - Src - 1));
        if Decoded <> #0 then
        begin
          Inc(Dst);
          Result[Dst] := Decoded;
          Src := Stop + 1; // continue after the ';'
          Continue;
        end;
      end;
      // Not a known entity: fall through and copy the '&' as plain text.
    end;

    Inc(Dst);
    Result[Dst] := s[Src];
    Inc(Src);
  end;

  SetLength(Result, Dst);
end;

// Converts an HTML fragment to plain text.
//
// Works in two passes, in this order:
//   1. all tags ('<' .. '>') are removed;
//   2. named character entities known to HTML2Char are decoded.
// Because decoding happens after tag removal, an escaped tag such as
// '&lt;b&gt;' ends up as the literal text '<b>' and is not stripped.
//
// Parameters:
//   HTML - the markup to convert; may be empty.
//
// Returns:
//   The plain text. Its length is exactly the number of characters produced
//   (no trailing #0 or padding). An empty input yields an empty string.
//
// Both passes are bounded by Length(...) and never index past the end of a
// string, so the function is safe with range checking enabled and does not
// stop early at an embedded #0.
function HTML2Text(const HTML: String): String;
begin
  Result := DecodeHtmlEntities(StripHtmlTags(HTML));
end;
EDIT:

Danke Matze

Ein Leerzeichen ist nicht #8 sondern #32.
Dass der Text nicht eingerückt wird, ist übrigens Absicht.

[edit=Chakotay1308]Die Diskussion zu diesem Source findet sich hier. Mfg, Chakotay1308[/edit]
Angehängte Dateien
Dateityp: zip html2text_153.zip (3,4 KB, 69x aufgerufen)
  Mit Zitat antworten Zitat