--- lib/Mail/SpamAssassin/HTML.pm 2015-04-29 04:56:49.000000000 +0900 +++ lib/Mail/SpamAssassin/HTML.pm 2015-08-30 00:46:40.902000000 +0900 @@ -695,7 +695,8 @@ } else { # NBSP: UTF-8: C2 A0, ISO-8859-*: A0 - $text =~ s/[ \t\n\r\f\x0b]+|\xc2\xa0/ /gs; + # Ideographic Space: UTF-8: E3 80 80 + $text =~ s/[ \t\n\r\f\x0b]+|(?:\xc2\xa0)+|(?:\xe3\x80\x80)+/ /gs; # trim leading whitespace if previous element was whitespace # and current element is not invisible if (@{ $self->{text} } && !$display{invisible} && @@ -742,7 +743,8 @@ my $invisible_for_bayes = 0; # NBSP: UTF-8: C2 A0, ISO-8859-*: A0 - if ($text !~ /^(?:[ \t\n\r\f\x0b]|\xc2\xa0)*\z/s) { + # Ideographic Space: UTF-8: E3 80 80 + if ($text !~ /^(?:[ \t\n\r\f\x0b]|\xc2\xa0|\xe3\x80\x80)*\z/s) { $invisible_for_bayes = $self->html_font_invisible($text); } --- lib/Mail/SpamAssassin/Message/Node.pm 2015-04-29 04:56:48.000000000 +0900 +++ lib/Mail/SpamAssassin/Message/Node.pm 2015-08-30 00:25:32.534000000 +0900 @@ -44,6 +44,7 @@ use Mail::SpamAssassin::Constants qw(:sa); use Mail::SpamAssassin::HTML; use Mail::SpamAssassin::Logger; +use Mail::SpamAssassin::Util::Charset; our($enc_utf8, $enc_w1252, $have_encode_detector); BEGIN { @@ -407,6 +408,10 @@ return $_[1] unless $self->{normalize} && $enc_utf8; + # FIXME: to be merged. + my ($decoded_data, $charset_detected) = normalize_charset($_[1], $charset_declared, $return_decoded); + return wantarray ? ($decoded_data, $charset_detected) : $decoded_data; + warn "message: _normalize() was given characters, expected bytes: $_[1]\n" if utf8::is_utf8($_[1]); @@ -603,6 +608,7 @@ my $text = $self->decode; # QP and Base64 decoding, bytes my $text_len = length($text); # num of bytes in original charset encoding + my $charset; # render text/html always, or any other text|text/plain part as text/html # based on a heuristic which simulates a certain common mail client @@ -622,7 +628,9 @@ # subroutine _normalize() to return Unicode text. See Bug 7133 # $character_semantics = 1; # $text will be in characters - $text = $self->_normalize($text, $self->{charset}, 1); # bytes to chars + ($text, $charset) = $self->_normalize($text, $self->{charset}, 1); # bytes to chars + $self->{charset} = $charset; + $self->{language} = get_language($text, $charset); } elsif (!defined $self->{charset} || $self->{charset} =~ /^(?:US-ASCII|UTF-8)\z/i) { # With some luck input can be interpreted as UTF-8, do not warn. @@ -657,7 +665,9 @@ else { # plain text if ($self->{normalize} && $enc_utf8) { # request transcoded result as UTF-8 octets! - $text = $self->_normalize($text, $self->{charset}, 0); + ($text, $charset) = $self->_normalize($text, $self->{charset}, 0); + $self->{charset} = $charset; + $self->{language} = get_language($text, $charset); } $self->{rendered_type} = $self->{type}; $self->{rendered} = $self->{'visible_rendered'} = $text; --- lib/Mail/SpamAssassin/Message.pm 2015-04-29 04:56:49.000000000 +0900 +++ lib/Mail/SpamAssassin/Message.pm 2015-08-30 00:52:32.210000000 +0900 @@ -627,6 +627,8 @@ delete $self->{'pristine_headers'}; delete $self->{'line_ending'}; delete $self->{'missing_head_body_separator'}; + delete $self->{'charset'}; + delete $self->{'language'}; my @toclean = ( $self ); @@ -653,6 +655,8 @@ delete $part->{'invisible_rendered'}; delete $part->{'type'}; delete $part->{'rendered_type'}; + delete $self->{'charset'}; + delete $self->{'language'}; # if there are children nodes, add them to the queue of nodes to clean up if (exists $part->{'body_parts'}) { @@ -1143,6 +1147,9 @@ # whitespace handling (warning: small changes have large effects!) $text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed # $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace (incl. VT, NBSP) => space + if ($self->{normalize}) { + $text =~ s/\xc2\xa0|\xe3\x80\x80/ /g; # whitespace (NBSP, ideographic space) => space + } $text =~ tr/ \t\n\r\x0b/ /s; # whitespace (incl. VT) => space $text =~ tr/\f/\n/; # form feeds => newline @@ -1235,6 +1242,27 @@ } # --------------------------------------------------------------------------- + +sub get_language { + my ($self) = @_; + + if (defined $self->{language}) { return $self->{language}; } + my @parts = $self->find_parts(qr/^(?:text|message)\b/i,1); + return '' unless @parts; + + # Go through each part + my @langs; + for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) { + my $p = $parts[$pt]; + my $lang = $p->{language}; + next unless ($lang); + push(@langs, $lang) unless (grep(/^$lang$/, @langs)) + } + $self->{language} = scalar(@langs) ? join(' ', @langs) : ''; + return $self->{language}; +} + +# --------------------------------------------------------------------------- 1; --- lib/Mail/SpamAssassin/PerMsgStatus.pm 2015-04-29 04:56:49.000000000 +0900 +++ lib/Mail/SpamAssassin/PerMsgStatus.pm 2015-08-30 00:55:35.583000000 +0900 @@ -55,6 +55,7 @@ use Errno qw(ENOENT); use Time::HiRes qw(time); +use Encode; use Mail::SpamAssassin::Constants qw(:sa); use Mail::SpamAssassin::AsyncLoop; @@ -1053,12 +1054,32 @@ # the SpamAssassin report my $report = $self->get_report(); - # If there are any wide characters, need to MIME-encode in UTF-8 - # TODO: If $report_charset is something other than iso-8859-1/us-ascii, then - # we could try converting to that charset if possible - unless ($] < 5.008 || utf8::downgrade($report, 1)) { + # decode to utf-8. + my $is_utf8 = 0; + if ($self->{conf}->{normalize_charset}) { + $report = Encode::decode_utf8($report); + $is_utf8 = 1; + } + else { + if ($self->{msg}->{charset}) { + eval { + my $scratch = $report; + $report = Encode::decode($self->{msg}->{charset},$scratch,Encode::FB_CROAK); + $is_utf8 = 1; + }; + } + } + # encode to report_charset. encode to utf-8 if charset conversion fail. + if ($is_utf8) { + eval { + my $scratch = $report; + $report = Encode::encode($self->{conf}->{report_charset},$scratch,Encode::FB_CROAK); + $is_utf8 = 0; + }; + if ($is_utf8) { + $report = Encode::encode_utf8($report); $report_charset = "; charset=utf-8"; - utf8::encode($report); + } } # get original headers, "pristine" if we can do it --- lib/Mail/SpamAssassin/Plugin/Bayes.pm 2015-04-29 04:56:47.000000000 +0900 +++ lib/Mail/SpamAssassin/Plugin/Bayes.pm 2015-08-30 00:25:43.443000000 +0900 @@ -70,6 +70,7 @@ $MARK_PRESENCE_ONLY_HDRS %HEADER_NAME_COMPRESSION $OPPORTUNISTIC_LOCK_VALID + $SKIP_UTF8_SHORT_TOKENS_RE }; # Which headers should we scan for tokens? Don't use all of them, as it's easy @@ -226,6 +227,15 @@ # will require a longer token than English ones.) use constant MAX_TOKEN_LENGTH => 15; +# Skip if a token is too short. +$SKIP_UTF8_SHORT_TOKENS_RE = qr{(?: + [\x00-\x7F] # 1 byte + | [\xC0-\xDF][\x80-\xBF] # 2 bytes + | [\xE0-\xEF][\x80-\xBF]{2} # 3 bytes + | [\xF0-\xF7][\x80-\xBF]{3} # 4 bytes + | (?:\xE3[\x81-\x83][\x80-\xBF]){2} # 2 characters of Hiragana and Katakana +)}x; + ########################################################################### sub new { @@ -1048,9 +1058,28 @@ $pms->{msg}->get_mimepart_digests() if $t_src->{mimepart}; @{$msgdata->{bayes_token_uris}} = $pms->get_uri_list() if $t_src->{uri}; + if ($self->{conf}->{normalize_charset}) { + my $tokenizer = $self->get_tokenizer($pms); + if (ref($tokenizer)) { + $msgdata->{bayes_token_body} = $tokenizer->tokenize($msgdata->{bayes_token_body}); + $msgdata->{bayes_token_inviz} = $tokenizer->tokenize($msgdata->{bayes_token_inviz}); + } + } return $msgdata; } +sub get_tokenizer { + my ($self, $msg) = @_; + + my $tokenizer; + my @languages = split(/\s+/, $msg->{msg}->get_language()); + foreach my $lang (@languages) { + $tokenizer = $self->{'conf'}->{'tokenizer'}->{$lang}; + last if (ref($tokenizer)); + } + return $tokenizer; +} + ########################################################################### # The calling functions expect a uniq'ed array of tokens ... @@ -1192,6 +1221,11 @@ next if $len < 3 || ($token =~ /^(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))$/i); + # Skip short UTF-8 tokens. + if ($self->{conf}->{normalize_charset}) { + next if ($token =~ /^$SKIP_UTF8_SHORT_TOKENS_RE$/o); + } + # are we in the body? If so, apply some body-specific breakouts if ($region == 1 || $region == 2) { if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) { @@ -1222,14 +1256,16 @@ } } - if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) { - # Matt sez: "Could be asian? Autrijus suggested doing character ngrams, - # but I'm doing tuples to keep the dbs small(er)." Sounds like a plan - # to me! (jm) - while ($token =~ s/^(..?)//) { - push (@rettokens, "8:$1"); + unless ($self->{conf}->{normalize_charset}) { + if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) { + # Matt sez: "Could be asian? Autrijus suggested doing character ngrams, + # but I'm doing tuples to keep the dbs small(er)." Sounds like a plan + # to me! (jm) + while ($token =~ s/^(..?)//) { + push (@rettokens, "8:$1"); + } + next; } - next; } if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS) --- lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm 1970-01-01 09:00:00.000000000 +0900 +++ lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm 2015-08-30 00:25:32.537000000 +0900 @@ -0,0 +1,84 @@ +# <@LICENSE> +# Copyright 2004 Apache Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +=head1 NAME + +Tokenizer::MeCab - Japanese tokenizer with MeCab + +=head1 SYNOPSIS + +loadplugin Mail::SpamAssassin::Plugin::Tokenizer::MeCab + +=head1 DESCRIPTION + +This plugin tokenizes a Japanese string with MeCab that is +the morphological analysis engine. + +Text::MeCab 0.12 or over is required. + +=cut + +package Mail::SpamAssassin::Plugin::Tokenizer::MeCab; + +use strict; +use warnings; +use Mail::SpamAssassin::Plugin::Tokenizer; + +use vars qw(@ISA); +@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer); + +# Have to do this so that RPM doesn't find these as required perl modules +BEGIN { require MeCab; } +our $language = 'ja'; +our $mecab = new MeCab::Tagger(-Ochasen); + +sub new { + my $class = shift; + my $mailsaobject = shift; + + $class = ref($class) || $class; + my $self = $class->SUPER::new($mailsaobject, $language); + bless ($self, $class); + + return $self; +} + +sub tokenize { + my $self = shift; + my $text_array = shift; + + my @tokenized_array; + foreach my $text (@$text_array) { + next unless ($text); + $text =~ s/([\x80-\xFF]{3,})/&_tokenize($1)/eg; + push(@tokenized_array, $text); + } + return \@tokenized_array; +} + +sub _tokenize { + my $text = shift; + + my @buf; + for (my $node = $mecab->parseToNode($text); $node->{next}; $node = $node->{next}) { + push(@buf, $node->{surface}); + } + my $tokenized = join(' ', @buf) . ' '; + return $tokenized; +} + +1; + --- lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm 1970-01-01 09:00:00.000000000 +0900 +++ lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm 2015-08-30 00:25:32.538000000 +0900 @@ -0,0 +1,111 @@ +# <@LICENSE> +# Copyright 2004 Apache Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +=head1 NAME + +Tokenizer::SimpleJA - simple Japanese tokenizer + +=head1 SYNOPSIS + +loadplugin Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA + +=head1 DESCRIPTION + +This plugin simply tokenizes a Japanese string by characters other than +the alphabet, the Chinese character, and the katakana. + +=cut + +package Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA; + +use strict; +use warnings; +use Mail::SpamAssassin::Plugin::Tokenizer; + +use vars qw(@ISA); +@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer); + +our $language = 'ja'; + +our $RE = qr{( + # Hiragana + (?: + \xE3\x81[\x80-\xBF] + | \xE3\x82[\x80-\x9F] + )+ + # Katakana + | (?: + \xE3\x82[\xA0-\xBF] + | \xE3\x83[\x80-\xBF] + )+ + # Kanji + | (?: + \xE3[\x90-\xBF][\x80-\xBF] + | [\xE4-\xE9][\x80-\xBF]{2} + | \xEF[\xA4-\xAB][\x80-\xBF] + )+ + # Fullwidth + | (?: + \xEF\xBC[\x80-\xBF] + | \xEF\xBD[\x80-\x9F] + )+ + # Others + | [\xC0-\xDF][\x80-\xBF] + | [\xE0-\xE2][\x80-\xBF]{2} + | \xE3\x80[\x80-\xBF] + | \xE3[\x84-\x8F][\x80-\xBF] + | [\xEA-\xEE][\x80-\xBF]{2} + | \xEF[\x80-\xA3][\x80-\xBF] + | \xEF[\xAC-\xBB][\x80-\xBF] + | \xEF\xBD[\xA0-\xBF] + | \xEF[\xBE-\xBF][\x80-\xBF] + | [\xF0-\xF7][\x80-\xBF]{3} +)}x; + +sub new { + my $class = shift; + my $mailsaobject = shift; + + $class = ref($class) || $class; + my $self = $class->SUPER::new($mailsaobject, $language); + bless ($self, $class); + + return $self; +} + +sub tokenize { + my $self = shift; + my $text_array = shift; + + my @tokenized_array; + foreach my $text (@$text_array) { + next unless ($text); + $text =~ s/([\x80-\xFF]{3,})/&_tokenize($1)/eg; + push(@tokenized_array, $text); + } + return \@tokenized_array; +} + +sub _tokenize { + my $text = shift; + + $text =~ s/$RE/$1 /og; + $text = ' ' . $text; + return $text; +} + +1; + --- lib/Mail/SpamAssassin/Plugin/Tokenizer.pm 1970-01-01 09:00:00.000000000 +0900 +++ lib/Mail/SpamAssassin/Plugin/Tokenizer.pm 2015-08-30 00:25:32.538000000 +0900 @@ -0,0 +1,115 @@ +# <@LICENSE> +# Copyright 2004 Apache Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +=head1 NAME + +Mail::SpamAssassin::Plugin::Tokenizer - Tokenizer plugin base class + +=head1 SYNOPSIS + +=head2 SpamAssassin configuration: + + loadplugin MyTokenizerPlugin /path/to/MyTokenizerPlugin.pm + +=head2 Perl code: + + use Mail::SpamAssassin::Plugin::Tokenizer; + use vars qw(@ISA); + @ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer); + # language to use this plugin + our $language = 'ja'; + + # constructor: register language + sub new { + my $class = shift; + my $mailsaobject = shift; + + # some boilerplate... + $class = ref($class) || $class; + my $self = $class->SUPER::new($mailsaobject, $language); + bless ($self, $class); + + return $self; + } + + # tokenize function + sub tokenize { + my $self = shift; + my $text_array_ref = shift; + + ...... + + return $tokenized_array_ref; + } + + +=head1 DESCRIPTION + +This plugin is the base class of tokenizer plugin. +You must define tokenize() and $language + +=head1 INTERFACE + + sub tokenize { + my $self = shift; + my $text_array_ref = shift; + + ...... + + return $tokenized_array_ref; + } + +=cut + +package Mail::SpamAssassin::Plugin::Tokenizer; + +use Mail::SpamAssassin::Plugin; +use Mail::SpamAssassin::Logger; +use strict; +use warnings; +use bytes; + +use vars qw(@ISA); +@ISA = qw(Mail::SpamAssassin::Plugin); + +sub new { + my $class = shift; + my $mailsaobject = shift; + my $language = shift; + + # some boilerplate... + $class = ref($class) || $class; + my $self = $class->SUPER::new($mailsaobject); + bless ($self, $class); + + if ($language) { + $self->{main}->{conf}->{tokenizer}->{$language} = $self; + } + else { + dbg("plugin: $self: \$language is not defined"); + } + + return $self; +} + +sub tokenize { + my ($self, $ref) = @_; + + return $ref; +} + +1; + --- lib/Mail/SpamAssassin/Util/Charset.pm 1970-01-01 09:00:00.000000000 +0900 +++ lib/Mail/SpamAssassin/Util/Charset.pm 2015-08-30 00:25:32.539000000 +0900 @@ -0,0 +1,473 @@ +# <@LICENSE> +# Copyright 2006 Apache Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +=head1 NAME + + Mail::SpamAssassin::Util::Charset.pm - Utility for charset and language + +=head1 SYNOPSIS + + my ($decoded, $detected) = Mail::SpamAssassin::Util::Charset::normalize_charset($str, $charset); + my $language = Mail::SpamAssassin::Util::Charset::get_language($str, $charset); + +=head1 DESCRIPTION + +This module implements utility methods for charset and language. + +=cut + +package Mail::SpamAssassin::Util::Charset; + +use strict; +use warnings; +use Encode; +use Encode::Guess; +use Encode::Alias; + +use vars qw ( + @ISA @EXPORT +); + +require Exporter; + +@ISA = qw(Exporter); +@EXPORT = qw(normalize_charset get_language); + +########################################################################### + +use constant HAS_ENCODE_DETECT => eval { require Encode::Detect::Detector; }; +use constant HAS_ENCODE_HANEXTRA => eval { require Encode::HanExtra; }; +use constant HAS_ENCODE_EUCJPMS => eval { require Encode::EUCJPMS; }; + +########################################################################### + +our $KANA_HAN_RE = qr{ + # Hiragana and Katakana + \xE3[\x81-\x83][\x80-\xBF] + # Han + | \xE3[\x90-\xBF][\x80-\xBF] + | [\xE4-\xE9][\x80-\xBF]{2} + | \xEF[\xA4-\xAB][\x80-\xBF] +}x; + +our %enc2lang; +our %lang2enc; +our %scr2lang; +our %cjkscr2lang; +our @scrorder; + +BEGIN { + + # See the following URL about this map: + # http://czyborra.com/charsets/iso8859.html + # http://czyborra.com/charsets/codepages.html + # http://czyborra.com/charsets/cyrillic.html + # http://en.wikipedia.org/wiki/ISO_8859 + # http://www.w3.org/International/O-charset-lang.html + %enc2lang = ( + # buint-in Encodings and Encode::Byte + # N. America + 'ascii' => 'en', + 'cp437' => 'en', + 'cp863' => 'weurope', + + # W. Europe (Latin1, Latin9) + # fr es ca eu pt it sq rm nl de da sv no fi fo is ga gd en af + 'iso-8859-1' => 'weurope', + 'iso-8859-15' => 'weurope', + 'cp850' => 'weurope', + 'cp860' => 'weurope', + 'cp1252' => 'weurope', + 'MacRoman' => 'weurope', + + # Cntrl. Europe / Latin2 / Latin10 + # hr cs hu pl sr sk sl + 'iso-8859-2' => 'ceurope', + 'cp852' => 'ceurope', + 'cp1250' => 'ceurope', + 'MacCentralEurRoman' => 'ceurope', + 'MacCroatian' => 'ceurope', + 'iso-8859-16' => 'ceurope', + 'MacRomanian' => 'ceurope', + + # Latin3 (Esperanto, Maltese, and Turkish. Turkish is now on 8859-9.) + # eo mt + 'iso-8859-3' => 'seurope', + + # Baltics (Latin4, Latin7) + # lv lt + 'iso-8859-4' => 'neurope', + 'iso-8859-13' => 'baltic', + 'cp1257' => 'baltic', + + # Nordics (Latin6) + # et kl iu se + 'iso-8859-10' => 'nordic', + + # Cyrillics + # bg be uk sr mk ru + 'iso-8859-5' => 'ru', + 'cp855' => 'ru', + 'cp1251' => 'ru', + 'cp866' => 'ru', + 'MacCyrillic' => 'ru', + 'koi8-r' => 'ru', + 'MacUkrainian' => 'uk', + 'koi8-u' => 'uk', + + # Arabic + 'iso-8859-6' => 'ar', + 'cp864' => 'ar', + 'cp1256' => 'ar', + 'MacArabic' => 'ar', + 'cp1006' => 'fa', + 'MacFarsi' => 'fa', + + # Greek + 'iso-8859-7' => 'el', + 'cp1253' => 'el', + 'MacGreek' => 'el', + + # Hebrew + # he yi + 'iso-8859-8' => 'he', + 'cp862' => 'he', + 'cp1255' => 'he', + 'MacHebrew' => 'he', + + # Turkish + 'iso-8859-9' => 'tr', + 'cp857' => 'tr', + 'cp1254' => 'tr', + 'MacTurkish' => 'tr', + + # Thai + 'iso-8859-11' => 'th', + 'cp874' => 'th', + + # Celtics (Latin8) + # gd cy br + 'iso-8859-14' => 'celtic', + + # Vietnamese + 'viscii' => 'vi', + 'cp1258' => 'vi', + + # Encode::CN + 'euc-cn' => 'zh', + 'cp936' => 'zh', + 'hz' => 'zh', + + # Encode::TW + 'big5-eten' => 'zh', + 'big5-hkscs' => 'zh', + 'cp950' => 'zh', + + # Encode::JP + 'euc-jp' => 'ja', + 'shiftjis' => 'ja', + '7bit-jis' => 'ja', + 'iso-2022-jp' => 'ja', + 'iso-2022-jp-1' => 'ja', + 'cp932' => 'ja', + + # Encode::KR + 'euc-kr' => 'ko', + 'cp949' => 'ko', + 'johab' => 'ko', + 'iso-2022-kr' => 'ko', + + # Encode::HanExtra + 'euc-tw' => 'zh', + 'gb18030' => 'zh', + + # Encode::JIS2K + 'euc-jisx0213' => 'ja', + 'shiftjisx0123' => 'ja', + 'iso-2022-jp-3' => 'ja', + + # Encode::EUCJPMS + 'eucJP-ms' => 'ja', + 'cp51932' => 'ja', + 'cp50220' => 'ja', + 'cp50221' => 'ja', + + ); + + %lang2enc = ( + # Latin1 + 'en' => ['ascii'], + 'weurope' => ['cp1252'], + + # Latin2 + 'ceurope' => ['cp1250'], + + # Latin3 + 'seurope' => ['iso-8859-3'], + + # Latin4 + 'neurope' => ['iso-8859-4'], + + # Latin5 + 'tr' => ['cp1254'], + + # Latin6 + 'nordic' => ['iso-8859-10'], + + # Latin7 + 'baltic' => ['cp1257'], + + # Latin8 + 'celtic' => ['iso-8859-14'], + + # Non Latin + 'ru' => ['koi8-r', 'cp1251'], + 'uk' => ['koi8-u'], + + 'ar' => ['cp1256'], + 'el' => ['cp1253'], + 'he' => ['cp1255'], + 'th' => ['cp874'], + 'vi' => ['viscii', 'cp1258'], + 'zh' => ['euc-cn', 'cp950'], + 'ja' => ['euc-jp', 'cp932'], + 'ko' => ['euc-kr', 'cp949'], + + ); + + %scr2lang = ( + 'InLatin1Supplement' => ['weurope'], + 'InLatinExtendedA' => [ + 'ceurope', + 'seurope', + 'tr', + 'vi' + ], + 'InLatinExtendedB' => [ + 'nordic', + 'baltic', + 'celtic' + ], + 'Thai' => ['th'], + 'Cyrillic' => ['ru', 'uk'], + 'Arabic' => ['ar'], + 'Greek' => ['el'], + 'Hebrew' => ['he'], + ); + + # better detection for CJK + @scrorder = ('Hiragana','Katakana','Hangul','Han',keys(%scr2lang)); + %cjkscr2lang = ( + 'Hiragana' => ['ja'], + 'Katakana' => ['ja'], + 'Hangul' => ['ko'], + 'Han' => ['zh', 'ja', 'ko'], + ); + + unless (HAS_ENCODE_HANEXTRA) { + Encode::Alias::define_alias( qr/^gb18030$/i => ' "euc-cn"' ); + } + Encode::Alias::define_alias( qr/^unicode-1-1-(.+)$/i => ' "$1"' ); + Encode::Alias::define_alias( qr/^TIS-620$/i => ' "iso-8859-11"' ); + Encode::Alias::define_alias( qr/^x-mac-(.+)$/i => ' "Mac$1"' ); + Encode::Alias::define_alias( qr/^Shift_JIS$/i => ' "cp932"' ); + if (HAS_ENCODE_EUCJPMS) { + Encode::Alias::define_alias( qr/^iso-2022-jp$/i => ' "cp50221"' ); + Encode::Alias::define_alias( qr/^euc-jp$/i => ' "cp51932"' ); + } +} + +sub get_language { + my $str = shift; # $str must be UTF-8 encoding + my $charset = shift; + + return 'en' unless $charset; + if ($charset !~ /^utf/i) { + return $enc2lang{$charset}; + } elsif (defined($str)) { + $str =~ s/[\x00-\x7F]//g; # remove ASCII characters + return 'en' if ($str eq ''); + + my %handled; + $str = Encode::decode_utf8($str) unless (Encode::is_utf8($str)); + foreach my $scr (@scrorder) { + next if ($str !~ /\p{$scr}/); + my $scrlangs = exists($cjkscr2lang{$scr}) ? $cjkscr2lang{$scr} : $scr2lang{$scr}; + foreach my $lang (@$scrlangs) { + next if (exists($handled{$lang})); + foreach my $enc (@{$lang2enc{$lang}}) { + my $scratch = $str; + Encode::encode($enc, $scratch, Encode::FB_QUIET); + return $lang if ($scratch eq ''); + } + $handled{$lang} = 1; + } + } + } + return 'en'; +} + +# TEST 1: try conversion to use the specified charset. +# TEST 2: try conversion to use Encode::Detect. +# TEST 3: try conversion to use Encode::Guess. +sub normalize_charset { + my $str = shift; + my $charset = shift; + my $return_decoded = shift; + + return wantarray ? ($str, 'ascii') : $str unless ($str); + + my $decoded; + my $detected; + + if ($charset) { + ($decoded, $detected) = _specified_encoding($str, $charset); + } + unless ($detected) { + ($decoded, $detected) = _encode_detect($str); + } + unless ($detected) { + ($decoded, $detected) = _encode_guess($str); + } + unless ($detected) { + return ($str, undef); + } + $decoded =~ s/^\x{feff}//g; + $decoded = Encode::encode_utf8($decoded) if $return_decoded; + + # unfold hiragana, katakana and han + if ($detected =~ /^(?:UTF|EUC|BIG5|GB|SHIFTJIS|ISO-2022|CP969$|CP932$|CP949$|CP50220|CP50221$)/i) { + $decoded =~ s/($KANA_HAN_RE)\012($KANA_HAN_RE)/$1$2/og; + } + return wantarray ? ($decoded, $detected) : $decoded; +} + +sub _specified_encoding { + my $str = shift; + my $encoding = shift; + + my $detected; + my $decoded; + + return (undef, undef) unless ($encoding); + + # note: ISO-2022-* is not deistinguish from US-ASCII + return (undef, undef) if ($str =~ /\e/ and $encoding !~ /^ISO-2022/i); + + # UTF-16|32 encoding without BOM cannot be trusted. + return (undef, undef) if ($encoding =~ /^UTF-32$/i and $str !~ /^(?:\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/); + return (undef, undef) if ($encoding =~ /^UTF-16$/i and $str !~ /^(?:\xFF\xFE|\xFE\xFF)/); + + $encoding = _get_alias($encoding); + my $encoder = Encode::find_encoding($encoding); + if (ref($encoder)) { + $decoded = $encoder->decode($str,Encode::FB_QUIET); + $detected = $encoder->name if ($str eq ''); + } + return ($decoded, $detected); +} + +sub _encode_detect { + return undef unless HAS_ENCODE_DETECT; + my $str = shift; + + # UTF-16|32 encoding without BOM cannot be trusted. + return (undef, undef) if ($str =~ /\x00\x00/ and $str !~ /^(?:\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/); + return (undef, undef) if ($str =~ /\x00/ and $str !~ /^(?:\xFF\xFE|\xFE\xFF)/); + + my $decoded; + my $detected = Encode::Detect::Detector::detect($str); + if ($detected) { + $detected = _get_alias($detected); + my $encoder = Encode::find_encoding($detected); + if (ref($encoder)) { + $decoded = $encoder->decode($str); + $detected = $decoded ? $encoder->name : undef; + } + else { + $detected = undef; + } + } + return ($decoded, $detected); +} + +sub _encode_guess { + my $str = shift; + + my $detected; + my $decoded; + my $encoder; + + # Step 1: Examine ISO-2022-*. + if ($str =~ /\e/) { + $Encode::Guess::NoUTFAutoGuess = 1; + $encoder = Encode::Guess::guess_encoding($str, + qw/cp50221 7bit-jis iso-2022-kr/); + $Encode::Guess::NoUTFAutoGuess = 0; + } + + # Step 2: Examine US-ASCII/UTF-(8|16|32) + unless (ref($encoder)) { + $Encode::Guess::NoUTFAutoGuess = 0; + $encoder = Encode::Guess::guess_encoding($str); + } + + # Step 3: Examine other encodings + unless (ref($encoder)) { + $Encode::Guess::NoUTFAutoGuess = 1; + eval { + if ($str =~ /[\x80-\xFF]{4}/) { + $encoder = Encode::Guess::guess_encoding($str, + qw/euc-cn big5-eten euc-jp cp932 euc-kr cp949/); + } + else { + $encoder = Encode::Guess::guess_encoding($str, + qw/iso-8859-1 cp1252/); + } + }; + $Encode::Guess::NoUTFAutoGuess = 0; + } + if (ref($encoder)) { + $detected = $encoder->name; + if ($detected) { + $decoded = $encoder->decode($str); + } + } + return ($decoded, $detected); +} + +sub _get_alias { + my $encoding = shift; + + unless (HAS_ENCODE_HANEXTRA) { + $encoding =~ s/^gb18030$/euc-cn/i; + } + $encoding =~ s/^unicode-1-1-(.+)$/$1/i; + $encoding =~ s/^TIS-620$/iso-8859-11/i; + $encoding =~ s/x-mac-(.+)$/Mac$1/i; + $encoding =~ s/^Shift_JIS$/cp932/i; + if (HAS_ENCODE_EUCJPMS) { + $encoding =~ s/^iso-2022-jp$/cp50221/i; + $encoding =~ s/^euc-jp$/cp51932/i; + } + + return $encoding; +} + + +1; +