and(.+?) } { my $Ascii = $Self->ToAscii( String => $2, ); # force line breaking if ( length $Ascii > $LineLength ) { $Ascii =~ s/(.{4,$LineLength})(?:\s|\z)/$1\n/gm; } $Ascii =~ s/^(.*?)$/> $1/gm; $Counter++; my $Key = "######Cite::$Counter######"; $Cite{$Key} = $Ascii; $Key; }segxmi; $Param{String} =~ s{tags and replace it with \n $Param{String} =~ s/<\/(\s{0,3})div>/\n/gsi; # remove hr tags and replace it with \n $Param{String} =~ s/\<(hr|hr.+?)\>/\n\n/gsi; # remove p, table tags and replace it with \n $Param{String} =~ s/\<(\/|)(p|p.+?|table|table.+?)\>/\n\n/gsi; # remove opening tr, th tags and replace them with \n $Param{String} =~ s/\<(tr|tr.+?|th|th.+?)\>/\n\n/gsi; # convert li tags to \n - $Param{String} =~ s/\(.+?)} { my $Ascii = $Self->ToAscii( String => $1, ); # force line breaking if ( length $Ascii > $LineLength ) { $Ascii =~ s/(.{4,$LineLength})(?:\s|\z)/$1\n/gm; } $Ascii =~ s/^(.*?)$/> $1/gm; $Counter++; my $Key = "######Cite::$Counter######"; $Cite{$Key} = $Ascii; $Key; }segxmi; # rememberandtags my %One2One; $Counter = 0; $Param{String} =~ s{ <(pre|code)(.*?)>(.+?)(pre|code)(.*?)> } { my $Content = $3; $Counter++; my $Key = "######One2One::$Counter######"; $One2One{$Key} = $Content; $Key; }segxmi; # remove comments at the first place to avoid to much work # for the regex engine $Param{String} =~ s{}{}xmgsi; # remove empty lines $Param{String} =~ s/^\s*//mg; # fix some bad stuff from opera and others $Param{String} =~ s/(\n\r|\r\r\n|\r\n)/\n/gs; # remove new line after
$Param{String} =~ s/(\
)(\n|\r)/$1/gsi; # replace new lines with one space $Param{String} =~ s/\n/ /gs; $Param{String} =~ s/\r/ /gs; # remove style tags $Param{String} =~ s{]*>}{}xgsi; # remove
,
,
,
, tags and replace it with \n $Param{String} =~ s/\
/\n/gsi; # remove/\n - /gsi; # convert and tags to \n\n $Param{String} =~ s/\<\/(ul|ol)\>/\n\n/gsi; # remove tags and replace them with " " $Param{String} =~ s/<\/td[^>]*>/ /gsi; # replace multiple spaces with just one space $Param{String} =~ s/[ ]{2,}/ /mg; # remember andtags and replace it for my $Key ( sort keys %One2One ) { $Param{String} =~ s/$Key/\n\n\n$One2One{$Key}\n\n/g; } # strip all other tags $Param{String} =~ s/\<.+?\>//gs; # html encode based on cpan's HTML::Entities v1.35 my %Entity = ( # Some normal chars that have special meaning in SGML context amp => '&', # ampersand 'gt' => '>', # greater than 'lt' => '<', # less than quot => '"', # double quote apos => "'", # single quote # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML AElig => chr(198), # capital AE diphthong (ligature) Aacute => chr(193), # capital A, acute accent Acirc => chr(194), # capital A, circumflex accent Agrave => chr(192), # capital A, grave accent Aring => chr(197), # capital A, ring Atilde => chr(195), # capital A, tilde Auml => chr(196), # capital A, dieresis or umlaut mark Ccedil => chr(199), # capital C, cedilla ETH => chr(208), # capital Eth, Icelandic Eacute => chr(201), # capital E, acute accent Ecirc => chr(202), # capital E, circumflex accent Egrave => chr(200), # capital E, grave accent Euml => chr(203), # capital E, dieresis or umlaut mark Iacute => chr(205), # capital I, acute accent Icirc => chr(206), # capital I, circumflex accent Igrave => chr(204), # capital I, grave accent Iuml => chr(207), # capital I, dieresis or umlaut mark Ntilde => chr(209), # capital N, tilde Oacute => chr(211), # capital O, acute accent Ocirc => chr(212), # capital O, circumflex accent Ograve => chr(210), # capital O, grave accent Oslash => chr(216), # capital O, slash Otilde => chr(213), # capital O, tilde Ouml => chr(214), # capital O, dieresis or umlaut mark THORN => chr(222), # capital THORN, Icelandic Uacute => chr(218), # capital U, acute accent Ucirc => chr(219), # capital U, circumflex accent Ugrave => chr(217), # capital U, grave accent Uuml => chr(220), # capital U, dieresis or umlaut mark Yacute => chr(221), # capital Y, acute accent aacute => chr(225), # small a, acute accent acirc => chr(226), # small a, circumflex accent aelig => chr(230), # small ae diphthong (ligature) agrave => chr(224), # small a, grave accent aring => chr(229), # small a, ring atilde => chr(227), # small a, tilde auml => chr(228), # small a, dieresis or umlaut mark ccedil => chr(231), # small c, cedilla eacute => chr(233), # small e, acute accent ecirc => chr(234), # small e, circumflex accent egrave => chr(232), # small e, grave accent eth => chr(240), # small eth, Icelandic euml => chr(235), # small e, dieresis or umlaut mark iacute => chr(237), # small i, acute accent icirc => chr(238), # small i, circumflex accent igrave => chr(236), # small i, grave accent iuml => chr(239), # small i, dieresis or umlaut mark ntilde => chr(241), # small n, tilde oacute => chr(243), # small o, acute accent ocirc => chr(244), # small o, circumflex accent ograve => chr(242), # small o, grave accent oslash => chr(248), # small o, slash otilde => chr(245), # small o, tilde ouml => chr(246), # small o, dieresis or umlaut mark szlig => chr(223), # small sharp s, German (sz ligature) thorn => chr(254), # small thorn, Icelandic uacute => chr(250), # small u, acute accent ucirc => chr(251), # small u, circumflex accent ugrave => chr(249), # small u, grave accent uuml => chr(252), # small u, dieresis or umlaut mark yacute => chr(253), # small y, acute accent yuml => chr(255), # small y, dieresis or umlaut mark # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96) copy => chr(169), # copyright sign reg => chr(174), # registered sign nbsp => chr(160), # non breaking space # Additional ISO-8859/1 entities listed in rfc1866 (section 14) iexcl => chr(161), cent => chr(162), pound => chr(163), curren => chr(164), yen => chr(165), brvbar => chr(166), sect => chr(167), uml => chr(168), ordf => chr(170), laquo => chr(171), 'not' => chr(172), # not is a keyword in perl shy => chr(173), macr => chr(175), deg => chr(176), plusmn => chr(177), sup1 => chr(185), sup2 => chr(178), sup3 => chr(179), acute => chr(180), micro => chr(181), para => chr(182), middot => chr(183), cedil => chr(184), ordm => chr(186), raquo => chr(187), frac14 => chr(188), frac12 => chr(189), frac34 => chr(190), iquest => chr(191), 'times' => chr(215), # times is a keyword in perl divide => chr(247), ( $] > 5.007 ? ( OElig => chr(338), oelig => chr(339), Scaron => chr(352), scaron => chr(353), Yuml => chr(376), fnof => chr(402), circ => chr(710), tilde => chr(732), Alpha => chr(913), Beta => chr(914), Gamma => chr(915), Delta => chr(916), Epsilon => chr(917), Zeta => chr(918), Eta => chr(919), Theta => chr(920), Iota => chr(921), Kappa => chr(922), Lambda => chr(923), Mu => chr(924), Nu => chr(925), Xi => chr(926), Omicron => chr(927), Pi => chr(928), Rho => chr(929), Sigma => chr(931), Tau => chr(932), Upsilon => chr(933), Phi => chr(934), Chi => chr(935), Psi => chr(936), Omega => chr(937), alpha => chr(945), beta => chr(946), gamma => chr(947), delta => chr(948), epsilon => chr(949), zeta => chr(950), eta => chr(951), theta => chr(952), iota => chr(953), kappa => chr(954), lambda => chr(955), mu => chr(956), nu => chr(957), xi => chr(958), omicron => chr(959), pi => chr(960), rho => chr(961), sigmaf => chr(962), sigma => chr(963), tau => chr(964), upsilon => chr(965), phi => chr(966), chi => chr(967), psi => chr(968), omega => chr(969), thetasym => chr(977), upsih => chr(978), piv => chr(982), ensp => chr(8194), emsp => chr(8195), thinsp => chr(8201), zwnj => chr(8204), zwj => chr(8205), lrm => chr(8206), rlm => chr(8207), ndash => chr(8211), mdash => chr(8212), lsquo => chr(8216), rsquo => chr(8217), sbquo => chr(8218), ldquo => chr(8220), rdquo => chr(8221), bdquo => chr(8222), dagger => chr(8224), Dagger => chr(8225), bull => chr(8226), hellip => chr(8230), permil => chr(8240), prime => chr(8242), Prime => chr(8243), lsaquo => chr(8249), rsaquo => chr(8250), oline => chr(8254), frasl => chr(8260), euro => chr(8364), image => chr(8465), weierp => chr(8472), real => chr(8476), trade => chr(8482), alefsym => chr(8501), larr => chr(8592), uarr => chr(8593), rarr => chr(8594), darr => chr(8595), harr => chr(8596), crarr => chr(8629), lArr => chr(8656), uArr => chr(8657), rArr => chr(8658), dArr => chr(8659), hArr => chr(8660), forall => chr(8704), part => chr(8706), exist => chr(8707), empty => chr(8709), nabla => chr(8711), isin => chr(8712), notin => chr(8713), ni => chr(8715), prod => chr(8719), sum => chr(8721), minus => chr(8722), lowast => chr(8727), radic => chr(8730), prop => chr(8733), infin => chr(8734), ang => chr(8736), 'and' => chr(8743), 'or' => chr(8744), cap => chr(8745), cup => chr(8746), 'int' => chr(8747), there4 => chr(8756), sim => chr(8764), cong => chr(8773), asymp => chr(8776), 'ne' => chr(8800), equiv => chr(8801), 'le' => chr(8804), 'ge' => chr(8805), 'sub' => chr(8834), sup => chr(8835), nsub => chr(8836), sube => chr(8838), supe => chr(8839), oplus => chr(8853), otimes => chr(8855), perp => chr(8869), sdot => chr(8901), lceil => chr(8968), rceil => chr(8969), lfloor => chr(8970), rfloor => chr(8971), lang => chr(9001), rang => chr(9002), loz => chr(9674), spades => chr(9824), clubs => chr(9827), hearts => chr(9829), diams => chr(9830), ) : () ) ); # encode html entities like "–" $Param{String} =~ s{ (&\#(\d+);?) } { my $ChrOrig = $1; my $Dec = $2; # Don't process UTF-16 surrogate pairs. Used on their own, these are not valid UTF-8 code # points and can result in errors in old Perl versions. See bug#12588 for more information. # - High Surrogate codes (U+D800-U+DBFF) # - Low Surrogate codes (U+DC00-U+DFFF) if ( $Dec >= 55296 && $Dec <= 57343 ) { $ChrOrig; } else { my $Chr = chr($Dec); # Make sure we get valid UTF8 code points, but skip characters from 128 to 255 # (inclusive), since they are by default internally not encoded as UTF-8 for # backward compatibility reasons. See bug#12457 for more information. if ( $Dec < 128 || $Dec> 255 ) { Encode::_utf8_off($Chr); $Chr = Encode::decode('utf-8', $Chr, 0); } if ( $Chr ) { $Chr; } else { $ChrOrig; } } }egx; # encode html entities like "=" $Param{String} =~ s{ (&\#[xX]([0-9a-fA-F]+);?) } { my $ChrOrig = $1; my $Dec = hex( $2 ); # Don't process UTF-16 surrogate pairs. Used on their own, these are not valid UTF-8 code # points and can result in errors in old Perl versions. See bug#12588 for more information. # - High Surrogate codes (U+D800-U+DBFF) # - Low Surrogate codes (U+DC00-U+DFFF) if ( $Dec >= 55296 && $Dec <= 57343 ) { $ChrOrig; } else { if ( $Dec ) { my $Chr = chr( $Dec ); # Make sure we get valid UTF8 code points, but skip characters from 128 to 255 # (inclusive), since they are by default internally not encoded as UTF-8 for # backward compatibility reasons. See bug#12457 for more information. if ( $Dec < 128 || $Dec > 255 ) { Encode::_utf8_off($Chr); $Chr = Encode::decode('utf-8', $Chr, 0); } if ( $Chr ) { $Chr; } else { $ChrOrig; } } else { $ChrOrig; } } }egx; # encode html entities like "&" $Param{String} =~ s{ (&(\w+);?) } { if ( $Entity{$2} ) { $Entity{$2}; } else { $1; } }egx; # remove empty lines $Param{String} =~ s/^\s*\n\s*\n/\n/mg; # force line breaking if ( length $Param{String} > $LineLength ) { $Param{String} =~ s/(.{4,$LineLength})(?:\s|\z)/$1\n/gm; } # rememberandToHTML( String => $String, ReplaceDoubleSpace => 0, # replace with " ", optional 1 or 0 (defaults to 1) ); =cut sub ToHTML { my ( $Self, %Param ) = @_; # check needed stuff for (qw(String)) { if ( !defined $Param{$_} ) { $Kernel::OM->Get('Kernel::System::Log')->Log( Priority => 'error', Message => "Need $_!" ); return; } } # fix some bad stuff from opera and others $Param{String} =~ s/(\n\r|\r\r\n|\r\n)/\n/gs; $Param{String} =~ s/&/&/g; $Param{String} =~ s/</g; $Param{String} =~ s/>/>/g; $Param{String} =~ s/"/"/g; $Param{String} =~ s/(\n|\r)/
\n/g; $Param{String} =~ s/ / /g if $Param{ReplaceDoubleSpace}; return $Param{String}; } =head2 DocumentComplete() check and e. g. add and tags to given html string my $HTMLDocument = $HTMLUtilsObject->DocumentComplete( String => $String, Charset => $Charset, ); =cut sub DocumentComplete { my ( $Self, %Param ) = @_; # check needed stuff for (qw(String Charset)) { if ( !defined $Param{$_} ) { $Kernel::OM->Get('Kernel::System::Log')->Log( Priority => 'error', Message => "Need $_!" ); return; } } return $Param{String} if $Param{String} =~ //i; my $Css = $Kernel::OM->Get('Kernel::Config')->Get('Frontend::RichText::DefaultCSS') || 'font-size: 12px; font-family:Courier,monospace,fixed;'; # escape special characters like double-quotes, e.g. used in font names with spaces $Css = $Self->ToHTML( String => $Css ); # Use the HTML5 doctype because it is compatible with HTML4 and causes the browsers # to render the content in standards mode, which is more safe than quirks mode. my $Body = ''; $Body .= ''; $Body .= '' . $Param{String} . ''; return $Body; } =head2 DocumentStrip() remove html document tags from string my $HTMLString = $HTMLUtilsObject->DocumentStrip( String => $String, ); =cut sub DocumentStrip { my ( $Self, %Param ) = @_; # check needed stuff for (qw(String)) { if ( !defined $Param{$_} ) { $Kernel::OM->Get('Kernel::System::Log')->Log( Priority => 'error', Message => "Need $_!" ); return; } } $Param{String} =~ s/^<\!DOCTYPE\s+HTML.+?>//gsi; $Param{String} =~ s/.+?<\/head>//gsi; $Param{String} =~ s/<(html|body)(.*?)>//gsi; $Param{String} =~ s/<\/(html|body)>//gsi; return $Param{String}; } =head2 DocumentCleanup() perform some sanity checks on HTML content. - Replace MS Word 12with class "MsoNormal" by using
because it's not used as(margin:0cm; margin-bottom:.0001pt;). - Replaceby using "" because of cross mail client and browser compatibility. - If there is no HTML doctype present, inject the HTML5 doctype, because it is compatible with HTML4 and causes the browsers to render the content in standards mode, which is safer. $HTMLBody = $HTMLUtilsObject->DocumentCleanup( String => $HTMLBody, ); =cut sub DocumentCleanup { my ( $Self, %Param ) = @_; # check needed stuff for (qw(String)) { if ( !defined $Param{$_} ) { $Kernel::OM->Get('Kernel::System::Log')->Log( Priority => 'error', Message => "Need $_!" ); return; } } # If the string starts with directly, inject the doctype $Param{String} =~ s{ \A \s* tags - see bug#8880 $Param{String} =~ s{}{}xmsi; # replace MS Word 12 with class "MsoNormal" by using
because # it's not used as(margin:0cm; margin-bottom:.0001pt;) $Param{String} =~ s{(.+?)
} { $4 . '
'; }segxmi; $Param{String} =~ s{(.+?)} { $4 . '
'; }segxmi; # replaceby using # "" # because of cross mail client and browser compatability my $Style = "border:none;border-left:solid blue 1.5pt;padding:0cm 0cm 0cm 4.0pt"; for ( 1 .. 10 ) { $Param{String} =~ s{(.+?)} { "$2"; }segxmi; } return $Param{String}; } =head2 LinkQuote() detect links in HTML code, add C if missing my $HTMLWithLinks = $HTMLUtilsObject->LinkQuote( String => $HTMLString, Target => 'TargetName', # content of target="?", e. g. _blank TargetAdd => 1, # add target="_blank" to all existing "LinkQuote( String => \$HTMLStringRef, ); =cut sub LinkQuote { my ( $Self, %Param ) = @_; my $String = $Param{String} || ''; # check ref my $StringScalar; if ( !ref $String ) { $StringScalar = $String; $String = \$StringScalar; # return if string is not a ref and it is empty return $StringScalar if !$StringScalar; } # add target to already existing url of html string if ( $Param{TargetAdd} ) { # find target my $Target = $Param{Target}; if ( !$Target ) { $Target = '_blank'; } # add target to existing "]+)> } { my $Start = $1; my $Value = $2; if ( $Value !~ /href=/i || $Value =~ /target=/i ) { "$Start$Value>"; } else { "$Start$Value target=\"$Target\">"; } }egxsi; } my $Marker = "§" x 10; # Remove existing ... tags and their content to be re-inserted later, this must not be quoted. # Also remove other tags to avoid quoting in tag parameters. my $Counter = 0; my %TagHash; ${$String} =~ s{ (]*?>[^>]*|<[^>]+?>) } { my $Content = $1; my $Key = "${Marker}TagHash-$Counter${Marker}"; $TagHash{$Counter++} = $Content; $Key; }egxism; # Add tags for URLs in the content. my $Target = ''; if ( $Param{Target} ) { $Target = " target=\"$Param{Target}\""; } ${$String} =~ s{ ( # $1 greater-than and less-than sign > | < | \s+ | §{10} | (?: &[a-zA-Z0-9]+; ) # get html entities ) ( # $2 (?: # http or only www (?: (?: http s? | ftp ) :\/\/) | # http://,https:// and ftp:// (?: (?: www | ftp ) \.) # www. and ftp. ) ) ( # $3 (?: [a-z0-9\-]+ \. )* # get subdomains, optional [a-z0-9\-]+ # get top level domain (?: # optional port number [:] [0-9]+ )? (?: # file path element [\/\.] | [a-zA-Z0-9\-_=%] )* (?: # param string [\?] # if param string is there, "?" must be present [a-zA-Z0-9&;=%\-_:\.\/]* # param string content, this will also catch entities like & )? (?: # link hash string [\#] # [a-zA-Z0-9&;=%\-_:\.\/]* # hash string content, this will also catch entities like & )? ) (?= # $4 (?: [\?,;!\.\)] (?: \s | $ ) # \)\s this construct is because of bug# 2450 | \" | \] | \s+ | ' | > # greater-than and less-than sign | < # " | (?: &[a-zA-Z0-9]+; )+ # html entities | $ # bug# 2715 ) | §{10} # ending TagHash ) } { my $Start = $1; my $Protocol = $2; my $Link = $3; my $End = $4 || ''; # there may different links for href and link body my $HrefLink; my $DisplayLink; if ( $Protocol =~ m{\A ( http | https | ftp ) : \/ \/ }xi ) { $DisplayLink = $Protocol . $Link; $HrefLink = $DisplayLink; } else { if ($Protocol =~ m{\A ftp }smx ) { $HrefLink = 'ftp://'; } else { $HrefLink = 'http://'; } if ( $Protocol ) { $HrefLink .= $Protocol; $DisplayLink = $Protocol; } $DisplayLink .= $Link; $HrefLink .= $Link; } $Start . "$DisplayLink<\/a>" . $End; }egxism; # Re-add previously removed tags. ${$String} =~ s{${Marker}TagHash-(\d+)${Marker}}{$TagHash{$1}}egsxim; # check ref && return result like called if ( defined $StringScalar ) { return ${$String}; } return $String; } =head2 Safety() To remove/strip active html tags/addons (javascript, C