package Sisimai::String; use feature ':5.10'; use strict; use warnings; use Encode; use Encode::Guess; use Digest::SHA; sub EOM { # End of email message as a sentinel for parsing bounce messages # @private # @return [String] Fixed length string like a constant return '__END_OF_EMAIL_MESSAGE__'; } sub token { # Create the message token from an addresser and a recipient # @param [String] addr1 A sender's email address # @param [String] addr2 A recipient's email address # @param [Integer] epoch Machine time of the email bounce # @return [String] Message token(MD5 hex digest) or empty string # if the any argument is missing # @see http://en.wikipedia.org/wiki/ASCII # @see https://metacpan.org/pod/Digest::MD5 my $class = shift || return ''; my $addr1 = shift || return ''; my $addr2 = shift || return ''; my $epoch = shift // return ''; # Format: STX(0x02) Sender-Address RS(0x1e) Recipient-Address ETX(0x03) return Digest::SHA::sha1_hex( sprintf("\x02%s\x1e%s\x1e%d\x03", lc $addr1, lc $addr2, $epoch)); } sub is_8bit { # The argument is 8-bit text or not # @param [String] argv1 Any string to be checked # @return [Integer] 0: ASCII Characters only # 1: Including 8-bit character my $class = shift; my $argv1 = shift // return undef; return undef unless ref $argv1; return undef unless ref $argv1 eq 'SCALAR'; return 1 unless $$argv1 =~ /\A[\x00-\x7f]+\z/; return 0; } sub sweep { # Clean the string out # @param [String] argv1 String to be cleaned # @return [Scalar] Cleaned out string # @example Clean up text # sweep(' neko ') #=> 'neko' my $class = shift; my $argv1 = shift // return undef; chomp $argv1; $argv1 =~ y/ //s; $argv1 =~ y/\t//d; $argv1 =~ s/\A //g; $argv1 =~ s/ \z//g; $argv1 =~ s/ [-]{2,}[^ \t].+\z//; return $argv1; } sub to_plain { # Convert given HTML text to plain text # @param [Scalar] argv1 HTML text(reference to string) # @param [Integer] loose Loose check flag # @return [Scalar] Plain text(reference to string) my $class = shift; my $argv1 = shift // return \''; my $loose = shift // 0; return \'' unless ref $argv1; return \'' unless ref $argv1 eq 'SCALAR'; my $plain = $$argv1; my $match = { 'html' => qr|].+?|sim, 'body' => qr|.+.*].+|sim, }; if( $loose || $plain =~ $match->{'html'} || $plain =~ $match->{'body'} ) { # ... # 1. Remove ... # 2. Remove # 3. ... to " http://... " # 4. ... to " Value " $plain =~ s|.+||gsim; $plain =~ s|.+||gsim; $plain =~ s|(.*?)| [$2]($1) |gsim; $plain =~ s|(.*?)| [$2](mailto:$1) |gsim; $plain =~ s/<[^<@>]+?>\s*/ /g; # Delete HTML tags except $plain =~ s/<//g; # Convert to right angle brackets $plain =~ s/&/&/g; # Convert to "&" $plain =~ s/"/"/g; # Convert to '"' $plain =~ s/'/'/g; # Convert to "'" $plain =~ s/ / /g; # Convert to ' ' if( length($$argv1) > length($plain) ) { $plain =~ y/ //s; $plain .= "\n" } } return \$plain; } sub to_utf8 { # Convert given string to UTF-8 # @param [String] argv1 String to be converted # @param [String] argv2 Encoding name before converting # @return [String] UTF-8 Encoded string my $class = shift; my $argv1 = shift || return \''; my $argv2 = shift; my $tobeutf8ed = $$argv1; my $encodefrom = lc $argv2 || ''; my $hasencoded = undef; my $hasguessed = Encode::Guess->guess($tobeutf8ed); my $encodingto = ref $hasguessed ? lc($hasguessed->name) : ''; my $dontencode = qr/\A(?>utf[-]?8|(?:us[-])?ascii)\z/; if( $encodefrom ) { # The 2nd argument is a encoding name of the 1st argument while(1) { # Encode a given string when the encoding of the string is neigther # utf8 nor ascii. last if $encodefrom =~ $dontencode; last if $encodingto =~ $dontencode; eval { # Try to convert the string to UTF-8 Encode::from_to($tobeutf8ed, $encodefrom, 'utf8'); $hasencoded = 1; }; last; } } unless( $hasencoded ) { # The 2nd argument was not given or failed to convert from $encodefrom # to UTF-8 if( $encodingto ) { # Guessed encoding name is available, try to encode using it. unless( $encodingto =~ $dontencode ) { # Encode a given string when the encoding of the string is neigther # utf8 nor ascii. eval { Encode::from_to($tobeutf8ed, $encodingto, 'utf8'); $hasencoded = 1; }; } } } return \$tobeutf8ed; } 1; __END__ =encoding utf-8 =head1 NAME Sisimai::String - String related class =head1 SYNOPSIS use Sisimai::String; my $s = 'envelope-sender@example.jp'; my $r = 'envelope-recipient@example.org'; my $t = time(); print Sisimai::String->token($s, $r, $t); # 2d635de42a44c54b291dda00a93ac27b print Sisimai::String->is_8bit(\'猫'); # 1 print Sisimai::String->sweep(' neko cat ');# 'neko cat' print Sisimai::String->to_utf8('^[$BG-^[(B', 'iso-2022-jp'); # 猫 print Sisimai::String->to_plain('neko'); # neko =head1 DESCRIPTION Sisimai::String provide utilities for dealing string =head1 CLASS METHODS =head2 C, I)>> C generates a token: Unique string generated by an envelope sender address and a envelope recipient address. my $s = 'envelope-sender@example.jp'; my $r = 'envelope-recipient@example.org'; print Sisimai::String->token($s, $r); # 2d635de42a44c54b291dda00a93ac27b =head2 C)>> C checks the argument include any 8bit character or not. print Sisimai::String->is_8bit(\'cat'); # 0; print Sisimai::String->is_8bit(\'ねこ'); # 1; =head2 C)>> C clean the argument string up: remove trailing spaces, squeeze spaces. print Sisimai::String->sweep(' cat neko '); # 'cat neko'; print Sisimai::String->sweep(' nyaa !!'); # 'nyaa !!'; =head2 C, [I])>> C converts given string to UTF-8. my $v = '^[$BG-^[(B'; # ISO-2022-JP print Sisimai::String->to_utf8($v, 'iso-2022-jp'); # 猫 =head2 C, [I])>> C converts given string as HTML to plain text. my $v = 'neko'; print Sisimai::String->to_plain($v); # neko =head1 AUTHOR azumakuniyuki =head1 COPYRIGHT Copyright (C) 2014-2016,2018 azumakuniyuki, All rights reserved. =head1 LICENSE This software is distributed under The BSD 2-Clause License. =cut