--- lib/Mail/SpamAssassin/HTML.pm 2015-04-29 04:56:49.000000000 +0900
+++ lib/Mail/SpamAssassin/HTML.pm 2015-08-30 00:46:40.902000000 +0900
@@ -695,7 +695,8 @@
}
else {
# NBSP: UTF-8: C2 A0, ISO-8859-*: A0
- $text =~ s/[ \t\n\r\f\x0b]+|\xc2\xa0/ /gs;
+ # Ideographic Space: UTF-8: E3 80 80
+ $text =~ s/[ \t\n\r\f\x0b]+|(?:\xc2\xa0)+|(?:\xe3\x80\x80)+/ /gs;
# trim leading whitespace if previous element was whitespace
# and current element is not invisible
if (@{ $self->{text} } && !$display{invisible} &&
@@ -742,7 +743,8 @@
my $invisible_for_bayes = 0;
# NBSP: UTF-8: C2 A0, ISO-8859-*: A0
- if ($text !~ /^(?:[ \t\n\r\f\x0b]|\xc2\xa0)*\z/s) {
+ # Ideographic Space: UTF-8: E3 80 80
+ if ($text !~ /^(?:[ \t\n\r\f\x0b]|\xc2\xa0|\xe3\x80\x80)*\z/s) {
$invisible_for_bayes = $self->html_font_invisible($text);
}
--- lib/Mail/SpamAssassin/Message/Node.pm 2015-04-29 04:56:48.000000000 +0900
+++ lib/Mail/SpamAssassin/Message/Node.pm 2015-08-30 00:25:32.534000000 +0900
@@ -44,6 +44,7 @@
use Mail::SpamAssassin::Constants qw(:sa);
use Mail::SpamAssassin::HTML;
use Mail::SpamAssassin::Logger;
+use Mail::SpamAssassin::Util::Charset;
our($enc_utf8, $enc_w1252, $have_encode_detector);
BEGIN {
@@ -407,6 +408,10 @@
return $_[1] unless $self->{normalize} && $enc_utf8;
+ # FIXME: to be merged.
+ my ($decoded_data, $charset_detected) = normalize_charset($_[1], $charset_declared, $return_decoded);
+ return wantarray ? ($decoded_data, $charset_detected) : $decoded_data;
+
warn "message: _normalize() was given characters, expected bytes: $_[1]\n"
if utf8::is_utf8($_[1]);
@@ -603,6 +608,7 @@
my $text = $self->decode; # QP and Base64 decoding, bytes
my $text_len = length($text); # num of bytes in original charset encoding
+ my $charset;
# render text/html always, or any other text|text/plain part as text/html
# based on a heuristic which simulates a certain common mail client
@@ -622,7 +628,9 @@
# subroutine _normalize() to return Unicode text. See Bug 7133
#
$character_semantics = 1; # $text will be in characters
- $text = $self->_normalize($text, $self->{charset}, 1); # bytes to chars
+ ($text, $charset) = $self->_normalize($text, $self->{charset}, 1); # bytes to chars
+ $self->{charset} = $charset;
+ $self->{language} = get_language($text, $charset);
} elsif (!defined $self->{charset} ||
$self->{charset} =~ /^(?:US-ASCII|UTF-8)\z/i) {
# With some luck input can be interpreted as UTF-8, do not warn.
@@ -657,7 +665,9 @@
else { # plain text
if ($self->{normalize} && $enc_utf8) {
# request transcoded result as UTF-8 octets!
- $text = $self->_normalize($text, $self->{charset}, 0);
+ ($text, $charset) = $self->_normalize($text, $self->{charset}, 0);
+ $self->{charset} = $charset;
+ $self->{language} = get_language($text, $charset);
}
$self->{rendered_type} = $self->{type};
$self->{rendered} = $self->{'visible_rendered'} = $text;
--- lib/Mail/SpamAssassin/Message.pm 2015-04-29 04:56:49.000000000 +0900
+++ lib/Mail/SpamAssassin/Message.pm 2015-08-30 00:52:32.210000000 +0900
@@ -627,6 +627,8 @@
delete $self->{'pristine_headers'};
delete $self->{'line_ending'};
delete $self->{'missing_head_body_separator'};
+ delete $self->{'charset'};
+ delete $self->{'language'};
my @toclean = ( $self );
@@ -653,6 +655,8 @@
delete $part->{'invisible_rendered'};
delete $part->{'type'};
delete $part->{'rendered_type'};
+ delete $self->{'charset'};
+ delete $self->{'language'};
# if there are children nodes, add them to the queue of nodes to clean up
if (exists $part->{'body_parts'}) {
@@ -1143,6 +1147,9 @@
# whitespace handling (warning: small changes have large effects!)
$text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed
# $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace (incl. VT, NBSP) => space
+ if ($self->{normalize}) {
+ $text =~ s/\xc2\xa0|\xe3\x80\x80/ /g; # whitespace (NBSP, ideographic space) => space
+ }
$text =~ tr/ \t\n\r\x0b/ /s; # whitespace (incl. VT) => space
$text =~ tr/\f/\n/; # form feeds => newline
@@ -1235,6 +1242,27 @@
}
# ---------------------------------------------------------------------------
+
+sub get_language {
+ my ($self) = @_;
+
+ if (defined $self->{language}) { return $self->{language}; }
+ my @parts = $self->find_parts(qr/^(?:text|message)\b/i,1);
+ return '' unless @parts;
+
+ # Go through each part
+ my @langs;
+ for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) {
+ my $p = $parts[$pt];
+ my $lang = $p->{language};
+ next unless ($lang);
+ push(@langs, $lang) unless (grep(/^$lang$/, @langs))
+ }
+ $self->{language} = scalar(@langs) ? join(' ', @langs) : '';
+ return $self->{language};
+}
+
+# ---------------------------------------------------------------------------
1;
--- lib/Mail/SpamAssassin/PerMsgStatus.pm 2015-04-29 04:56:49.000000000 +0900
+++ lib/Mail/SpamAssassin/PerMsgStatus.pm 2015-08-30 00:55:35.583000000 +0900
@@ -55,6 +55,7 @@
use Errno qw(ENOENT);
use Time::HiRes qw(time);
+use Encode;
use Mail::SpamAssassin::Constants qw(:sa);
use Mail::SpamAssassin::AsyncLoop;
@@ -1053,12 +1054,32 @@
# the SpamAssassin report
my $report = $self->get_report();
- # If there are any wide characters, need to MIME-encode in UTF-8
- # TODO: If $report_charset is something other than iso-8859-1/us-ascii, then
- # we could try converting to that charset if possible
- unless ($] < 5.008 || utf8::downgrade($report, 1)) {
+ # decode to utf-8.
+ my $is_utf8 = 0;
+ if ($self->{conf}->{normalize_charset}) {
+ $report = Encode::decode_utf8($report);
+ $is_utf8 = 1;
+ }
+ else {
+ if ($self->{msg}->{charset}) {
+ eval {
+ my $scratch = $report;
+ $report = Encode::decode($self->{msg}->{charset},$scratch,Encode::FB_CROAK);
+ $is_utf8 = 1;
+ };
+ }
+ }
+ # encode to report_charset. encode to utf-8 if charset conversion fail.
+ if ($is_utf8) {
+ eval {
+ my $scratch = $report;
+ $report = Encode::encode($self->{conf}->{report_charset},$scratch,Encode::FB_CROAK);
+ $is_utf8 = 0;
+ };
+ if ($is_utf8) {
+ $report = Encode::encode_utf8($report);
$report_charset = "; charset=utf-8";
- utf8::encode($report);
+ }
}
# get original headers, "pristine" if we can do it
--- lib/Mail/SpamAssassin/Plugin/Bayes.pm 2015-04-29 04:56:47.000000000 +0900
+++ lib/Mail/SpamAssassin/Plugin/Bayes.pm 2015-08-30 00:25:43.443000000 +0900
@@ -70,6 +70,7 @@
$MARK_PRESENCE_ONLY_HDRS
%HEADER_NAME_COMPRESSION
$OPPORTUNISTIC_LOCK_VALID
+ $SKIP_UTF8_SHORT_TOKENS_RE
};
# Which headers should we scan for tokens? Don't use all of them, as it's easy
@@ -226,6 +227,15 @@
# will require a longer token than English ones.)
use constant MAX_TOKEN_LENGTH => 15;
+# Skip if a token is too short.
+$SKIP_UTF8_SHORT_TOKENS_RE = qr{(?:
+ [\x00-\x7F] # 1 byte
+ | [\xC0-\xDF][\x80-\xBF] # 2 bytes
+ | [\xE0-\xEF][\x80-\xBF]{2} # 3 bytes
+ | [\xF0-\xF7][\x80-\xBF]{3} # 4 bytes
+ | (?:\xE3[\x81-\x83][\x80-\xBF]){2} # 2 characters of Hiragana and Katakana
+)}x;
+
###########################################################################
sub new {
@@ -1048,9 +1058,28 @@
$pms->{msg}->get_mimepart_digests() if $t_src->{mimepart};
@{$msgdata->{bayes_token_uris}} =
$pms->get_uri_list() if $t_src->{uri};
+ if ($self->{conf}->{normalize_charset}) {
+ my $tokenizer = $self->get_tokenizer($pms);
+ if (ref($tokenizer)) {
+ $msgdata->{bayes_token_body} = $tokenizer->tokenize($msgdata->{bayes_token_body});
+ $msgdata->{bayes_token_inviz} = $tokenizer->tokenize($msgdata->{bayes_token_inviz});
+ }
+ }
return $msgdata;
}
+sub get_tokenizer {
+ my ($self, $msg) = @_;
+
+ my $tokenizer;
+ my @languages = split(/\s+/, $msg->{msg}->get_language());
+ foreach my $lang (@languages) {
+ $tokenizer = $self->{'conf'}->{'tokenizer'}->{$lang};
+ last if (ref($tokenizer));
+ }
+ return $tokenizer;
+}
+
###########################################################################
# The calling functions expect a uniq'ed array of tokens ...
@@ -1192,6 +1221,11 @@
next if $len < 3 ||
($token =~ /^(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))$/i);
+ # Skip short UTF-8 tokens.
+ if ($self->{conf}->{normalize_charset}) {
+ next if ($token =~ /^$SKIP_UTF8_SHORT_TOKENS_RE$/o);
+ }
+
# are we in the body? If so, apply some body-specific breakouts
if ($region == 1 || $region == 2) {
if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) {
@@ -1222,14 +1256,16 @@
}
}
- if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
- # Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
- # but I'm doing tuples to keep the dbs small(er)." Sounds like a plan
- # to me! (jm)
- while ($token =~ s/^(..?)//) {
- push (@rettokens, "8:$1");
+ unless ($self->{conf}->{normalize_charset}) {
+ if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
+ # Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
+ # but I'm doing tuples to keep the dbs small(er)." Sounds like a plan
+ # to me! (jm)
+ while ($token =~ s/^(..?)//) {
+ push (@rettokens, "8:$1");
+ }
+ next;
}
- next;
}
if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS)
--- lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm 1970-01-01 09:00:00.000000000 +0900
+++ lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm 2015-08-30 00:25:32.537000000 +0900
@@ -0,0 +1,84 @@
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# @LICENSE>
+
+=head1 NAME
+
+Tokenizer::MeCab - Japanese tokenizer with MeCab
+
+=head1 SYNOPSIS
+
+loadplugin Mail::SpamAssassin::Plugin::Tokenizer::MeCab
+
+=head1 DESCRIPTION
+
+This plugin tokenizes a Japanese string with MeCab that is
+the morphological analysis engine.
+
+Text::MeCab 0.12 or over is required.
+
+=cut
+
+package Mail::SpamAssassin::Plugin::Tokenizer::MeCab;
+
+use strict;
+use warnings;
+use Mail::SpamAssassin::Plugin::Tokenizer;
+
+use vars qw(@ISA);
+@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer);
+
+# Have to do this so that RPM doesn't find these as required perl modules
+BEGIN { require MeCab; }
+our $language = 'ja';
+our $mecab = new MeCab::Tagger(-Ochasen);
+
+sub new {
+ my $class = shift;
+ my $mailsaobject = shift;
+
+ $class = ref($class) || $class;
+ my $self = $class->SUPER::new($mailsaobject, $language);
+ bless ($self, $class);
+
+ return $self;
+}
+
+sub tokenize {
+ my $self = shift;
+ my $text_array = shift;
+
+ my @tokenized_array;
+ foreach my $text (@$text_array) {
+ next unless ($text);
+ $text =~ s/([\x80-\xFF]{3,})/&_tokenize($1)/eg;
+ push(@tokenized_array, $text);
+ }
+ return \@tokenized_array;
+}
+
+sub _tokenize {
+ my $text = shift;
+
+ my @buf;
+ for (my $node = $mecab->parseToNode($text); $node->{next}; $node = $node->{next}) {
+ push(@buf, $node->{surface});
+ }
+ my $tokenized = join(' ', @buf) . ' ';
+ return $tokenized;
+}
+
+1;
+
--- lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm 1970-01-01 09:00:00.000000000 +0900
+++ lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm 2015-08-30 00:25:32.538000000 +0900
@@ -0,0 +1,111 @@
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# @LICENSE>
+
+=head1 NAME
+
+Tokenizer::SimpleJA - simple Japanese tokenizer
+
+=head1 SYNOPSIS
+
+loadplugin Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA
+
+=head1 DESCRIPTION
+
+This plugin simply tokenizes a Japanese string by characters other than
+the alphabet, the Chinese character, and the katakana.
+
+=cut
+
+package Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA;
+
+use strict;
+use warnings;
+use Mail::SpamAssassin::Plugin::Tokenizer;
+
+use vars qw(@ISA);
+@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer);
+
+our $language = 'ja';
+
+our $RE = qr{(
+ # Hiragana
+ (?:
+ \xE3\x81[\x80-\xBF]
+ | \xE3\x82[\x80-\x9F]
+ )+
+ # Katakana
+ | (?:
+ \xE3\x82[\xA0-\xBF]
+ | \xE3\x83[\x80-\xBF]
+ )+
+ # Kanji
+ | (?:
+ \xE3[\x90-\xBF][\x80-\xBF]
+ | [\xE4-\xE9][\x80-\xBF]{2}
+ | \xEF[\xA4-\xAB][\x80-\xBF]
+ )+
+ # Fullwidth
+ | (?:
+ \xEF\xBC[\x80-\xBF]
+ | \xEF\xBD[\x80-\x9F]
+ )+
+ # Others
+ | [\xC0-\xDF][\x80-\xBF]
+ | [\xE0-\xE2][\x80-\xBF]{2}
+ | \xE3\x80[\x80-\xBF]
+ | \xE3[\x84-\x8F][\x80-\xBF]
+ | [\xEA-\xEE][\x80-\xBF]{2}
+ | \xEF[\x80-\xA3][\x80-\xBF]
+ | \xEF[\xAC-\xBB][\x80-\xBF]
+ | \xEF\xBD[\xA0-\xBF]
+ | \xEF[\xBE-\xBF][\x80-\xBF]
+ | [\xF0-\xF7][\x80-\xBF]{3}
+)}x;
+
+sub new {
+ my $class = shift;
+ my $mailsaobject = shift;
+
+ $class = ref($class) || $class;
+ my $self = $class->SUPER::new($mailsaobject, $language);
+ bless ($self, $class);
+
+ return $self;
+}
+
+sub tokenize {
+ my $self = shift;
+ my $text_array = shift;
+
+ my @tokenized_array;
+ foreach my $text (@$text_array) {
+ next unless ($text);
+ $text =~ s/([\x80-\xFF]{3,})/&_tokenize($1)/eg;
+ push(@tokenized_array, $text);
+ }
+ return \@tokenized_array;
+}
+
+sub _tokenize {
+ my $text = shift;
+
+ $text =~ s/$RE/$1 /og;
+ $text = ' ' . $text;
+ return $text;
+}
+
+1;
+
--- lib/Mail/SpamAssassin/Plugin/Tokenizer.pm 1970-01-01 09:00:00.000000000 +0900
+++ lib/Mail/SpamAssassin/Plugin/Tokenizer.pm 2015-08-30 00:25:32.538000000 +0900
@@ -0,0 +1,115 @@
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# @LICENSE>
+
+=head1 NAME
+
+Mail::SpamAssassin::Plugin::Tokenizer - Tokenizer plugin base class
+
+=head1 SYNOPSIS
+
+=head2 SpamAssassin configuration:
+
+ loadplugin MyTokenizerPlugin /path/to/MyTokenizerPlugin.pm
+
+=head2 Perl code:
+
+ use Mail::SpamAssassin::Plugin::Tokenizer;
+ use vars qw(@ISA);
+ @ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer);
+ # language to use this plugin
+ our $language = 'ja';
+
+ # constructor: register language
+ sub new {
+ my $class = shift;
+ my $mailsaobject = shift;
+
+ # some boilerplate...
+ $class = ref($class) || $class;
+ my $self = $class->SUPER::new($mailsaobject, $language);
+ bless ($self, $class);
+
+ return $self;
+ }
+
+ # tokenize function
+ sub tokenize {
+ my $self = shift;
+ my $text_array_ref = shift;
+
+ ......
+
+ return $tokenized_array_ref;
+ }
+
+
+=head1 DESCRIPTION
+
+This plugin is the base class of tokenizer plugin.
+You must define tokenize() and $language
+
+=head1 INTERFACE
+
+ sub tokenize {
+ my $self = shift;
+ my $text_array_ref = shift;
+
+ ......
+
+ return $tokenized_array_ref;
+ }
+
+=cut
+
+package Mail::SpamAssassin::Plugin::Tokenizer;
+
+use Mail::SpamAssassin::Plugin;
+use Mail::SpamAssassin::Logger;
+use strict;
+use warnings;
+use bytes;
+
+use vars qw(@ISA);
+@ISA = qw(Mail::SpamAssassin::Plugin);
+
+sub new {
+ my $class = shift;
+ my $mailsaobject = shift;
+ my $language = shift;
+
+ # some boilerplate...
+ $class = ref($class) || $class;
+ my $self = $class->SUPER::new($mailsaobject);
+ bless ($self, $class);
+
+ if ($language) {
+ $self->{main}->{conf}->{tokenizer}->{$language} = $self;
+ }
+ else {
+ dbg("plugin: $self: \$language is not defined");
+ }
+
+ return $self;
+}
+
+sub tokenize {
+ my ($self, $ref) = @_;
+
+ return $ref;
+}
+
+1;
+
--- lib/Mail/SpamAssassin/Util/Charset.pm 1970-01-01 09:00:00.000000000 +0900
+++ lib/Mail/SpamAssassin/Util/Charset.pm 2015-08-30 00:25:32.539000000 +0900
@@ -0,0 +1,473 @@
+# <@LICENSE>
+# Copyright 2006 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# @LICENSE>
+
+
+=head1 NAME
+
+ Mail::SpamAssassin::Util::Charset.pm - Utility for charset and language
+
+=head1 SYNOPSIS
+
+ my ($decoded, $detected) = Mail::SpamAssassin::Util::Charset::normalize_charset($str, $charset);
+ my $language = Mail::SpamAssassin::Util::Charset::get_language($str, $charset);
+
+=head1 DESCRIPTION
+
+This module implements utility methods for charset and language.
+
+=cut
+
+package Mail::SpamAssassin::Util::Charset;
+
+use strict;
+use warnings;
+use Encode;
+use Encode::Guess;
+use Encode::Alias;
+
+use vars qw (
+ @ISA @EXPORT
+);
+
+require Exporter;
+
+@ISA = qw(Exporter);
+@EXPORT = qw(normalize_charset get_language);
+
+###########################################################################
+
+use constant HAS_ENCODE_DETECT => eval { require Encode::Detect::Detector; };
+use constant HAS_ENCODE_HANEXTRA => eval { require Encode::HanExtra; };
+use constant HAS_ENCODE_EUCJPMS => eval { require Encode::EUCJPMS; };
+
+###########################################################################
+
+our $KANA_HAN_RE = qr{
+ # Hiragana and Katakana
+ \xE3[\x81-\x83][\x80-\xBF]
+ # Han
+ | \xE3[\x90-\xBF][\x80-\xBF]
+ | [\xE4-\xE9][\x80-\xBF]{2}
+ | \xEF[\xA4-\xAB][\x80-\xBF]
+}x;
+
+our %enc2lang;
+our %lang2enc;
+our %scr2lang;
+our %cjkscr2lang;
+our @scrorder;
+
+BEGIN {
+
+ # See the following URL about this map:
+ # http://czyborra.com/charsets/iso8859.html
+ # http://czyborra.com/charsets/codepages.html
+ # http://czyborra.com/charsets/cyrillic.html
+ # http://en.wikipedia.org/wiki/ISO_8859
+ # http://www.w3.org/International/O-charset-lang.html
+ %enc2lang = (
+ # buint-in Encodings and Encode::Byte
+ # N. America
+ 'ascii' => 'en',
+ 'cp437' => 'en',
+ 'cp863' => 'weurope',
+
+ # W. Europe (Latin1, Latin9)
+ # fr es ca eu pt it sq rm nl de da sv no fi fo is ga gd en af
+ 'iso-8859-1' => 'weurope',
+ 'iso-8859-15' => 'weurope',
+ 'cp850' => 'weurope',
+ 'cp860' => 'weurope',
+ 'cp1252' => 'weurope',
+ 'MacRoman' => 'weurope',
+
+ # Cntrl. Europe / Latin2 / Latin10
+ # hr cs hu pl sr sk sl
+ 'iso-8859-2' => 'ceurope',
+ 'cp852' => 'ceurope',
+ 'cp1250' => 'ceurope',
+ 'MacCentralEurRoman' => 'ceurope',
+ 'MacCroatian' => 'ceurope',
+ 'iso-8859-16' => 'ceurope',
+ 'MacRomanian' => 'ceurope',
+
+ # Latin3 (Esperanto, Maltese, and Turkish. Turkish is now on 8859-9.)
+ # eo mt
+ 'iso-8859-3' => 'seurope',
+
+ # Baltics (Latin4, Latin7)
+ # lv lt
+ 'iso-8859-4' => 'neurope',
+ 'iso-8859-13' => 'baltic',
+ 'cp1257' => 'baltic',
+
+ # Nordics (Latin6)
+ # et kl iu se
+ 'iso-8859-10' => 'nordic',
+
+ # Cyrillics
+ # bg be uk sr mk ru
+ 'iso-8859-5' => 'ru',
+ 'cp855' => 'ru',
+ 'cp1251' => 'ru',
+ 'cp866' => 'ru',
+ 'MacCyrillic' => 'ru',
+ 'koi8-r' => 'ru',
+ 'MacUkrainian' => 'uk',
+ 'koi8-u' => 'uk',
+
+ # Arabic
+ 'iso-8859-6' => 'ar',
+ 'cp864' => 'ar',
+ 'cp1256' => 'ar',
+ 'MacArabic' => 'ar',
+ 'cp1006' => 'fa',
+ 'MacFarsi' => 'fa',
+
+ # Greek
+ 'iso-8859-7' => 'el',
+ 'cp1253' => 'el',
+ 'MacGreek' => 'el',
+
+ # Hebrew
+ # he yi
+ 'iso-8859-8' => 'he',
+ 'cp862' => 'he',
+ 'cp1255' => 'he',
+ 'MacHebrew' => 'he',
+
+ # Turkish
+ 'iso-8859-9' => 'tr',
+ 'cp857' => 'tr',
+ 'cp1254' => 'tr',
+ 'MacTurkish' => 'tr',
+
+ # Thai
+ 'iso-8859-11' => 'th',
+ 'cp874' => 'th',
+
+ # Celtics (Latin8)
+ # gd cy br
+ 'iso-8859-14' => 'celtic',
+
+ # Vietnamese
+ 'viscii' => 'vi',
+ 'cp1258' => 'vi',
+
+ # Encode::CN
+ 'euc-cn' => 'zh',
+ 'cp936' => 'zh',
+ 'hz' => 'zh',
+
+ # Encode::TW
+ 'big5-eten' => 'zh',
+ 'big5-hkscs' => 'zh',
+ 'cp950' => 'zh',
+
+ # Encode::JP
+ 'euc-jp' => 'ja',
+ 'shiftjis' => 'ja',
+ '7bit-jis' => 'ja',
+ 'iso-2022-jp' => 'ja',
+ 'iso-2022-jp-1' => 'ja',
+ 'cp932' => 'ja',
+
+ # Encode::KR
+ 'euc-kr' => 'ko',
+ 'cp949' => 'ko',
+ 'johab' => 'ko',
+ 'iso-2022-kr' => 'ko',
+
+ # Encode::HanExtra
+ 'euc-tw' => 'zh',
+ 'gb18030' => 'zh',
+
+ # Encode::JIS2K
+ 'euc-jisx0213' => 'ja',
+ 'shiftjisx0123' => 'ja',
+ 'iso-2022-jp-3' => 'ja',
+
+ # Encode::EUCJPMS
+ 'eucJP-ms' => 'ja',
+ 'cp51932' => 'ja',
+ 'cp50220' => 'ja',
+ 'cp50221' => 'ja',
+
+ );
+
+ %lang2enc = (
+ # Latin1
+ 'en' => ['ascii'],
+ 'weurope' => ['cp1252'],
+
+ # Latin2
+ 'ceurope' => ['cp1250'],
+
+ # Latin3
+ 'seurope' => ['iso-8859-3'],
+
+ # Latin4
+ 'neurope' => ['iso-8859-4'],
+
+ # Latin5
+ 'tr' => ['cp1254'],
+
+ # Latin6
+ 'nordic' => ['iso-8859-10'],
+
+ # Latin7
+ 'baltic' => ['cp1257'],
+
+ # Latin8
+ 'celtic' => ['iso-8859-14'],
+
+ # Non Latin
+ 'ru' => ['koi8-r', 'cp1251'],
+ 'uk' => ['koi8-u'],
+
+ 'ar' => ['cp1256'],
+ 'el' => ['cp1253'],
+ 'he' => ['cp1255'],
+ 'th' => ['cp874'],
+ 'vi' => ['viscii', 'cp1258'],
+ 'zh' => ['euc-cn', 'cp950'],
+ 'ja' => ['euc-jp', 'cp932'],
+ 'ko' => ['euc-kr', 'cp949'],
+
+ );
+
+ %scr2lang = (
+ 'InLatin1Supplement' => ['weurope'],
+ 'InLatinExtendedA' => [
+ 'ceurope',
+ 'seurope',
+ 'tr',
+ 'vi'
+ ],
+ 'InLatinExtendedB' => [
+ 'nordic',
+ 'baltic',
+ 'celtic'
+ ],
+ 'Thai' => ['th'],
+ 'Cyrillic' => ['ru', 'uk'],
+ 'Arabic' => ['ar'],
+ 'Greek' => ['el'],
+ 'Hebrew' => ['he'],
+ );
+
+ # better detection for CJK
+ @scrorder = ('Hiragana','Katakana','Hangul','Han',keys(%scr2lang));
+ %cjkscr2lang = (
+ 'Hiragana' => ['ja'],
+ 'Katakana' => ['ja'],
+ 'Hangul' => ['ko'],
+ 'Han' => ['zh', 'ja', 'ko'],
+ );
+
+ unless (HAS_ENCODE_HANEXTRA) {
+ Encode::Alias::define_alias( qr/^gb18030$/i => ' "euc-cn"' );
+ }
+ Encode::Alias::define_alias( qr/^unicode-1-1-(.+)$/i => ' "$1"' );
+ Encode::Alias::define_alias( qr/^TIS-620$/i => ' "iso-8859-11"' );
+ Encode::Alias::define_alias( qr/^x-mac-(.+)$/i => ' "Mac$1"' );
+ Encode::Alias::define_alias( qr/^Shift_JIS$/i => ' "cp932"' );
+ if (HAS_ENCODE_EUCJPMS) {
+ Encode::Alias::define_alias( qr/^iso-2022-jp$/i => ' "cp50221"' );
+ Encode::Alias::define_alias( qr/^euc-jp$/i => ' "cp51932"' );
+ }
+}
+
+sub get_language {
+ my $str = shift; # $str must be UTF-8 encoding
+ my $charset = shift;
+
+ return 'en' unless $charset;
+ if ($charset !~ /^utf/i) {
+ return $enc2lang{$charset};
+ } elsif (defined($str)) {
+ $str =~ s/[\x00-\x7F]//g; # remove ASCII characters
+ return 'en' if ($str eq '');
+
+ my %handled;
+ $str = Encode::decode_utf8($str) unless (Encode::is_utf8($str));
+ foreach my $scr (@scrorder) {
+ next if ($str !~ /\p{$scr}/);
+ my $scrlangs = exists($cjkscr2lang{$scr}) ? $cjkscr2lang{$scr} : $scr2lang{$scr};
+ foreach my $lang (@$scrlangs) {
+ next if (exists($handled{$lang}));
+ foreach my $enc (@{$lang2enc{$lang}}) {
+ my $scratch = $str;
+ Encode::encode($enc, $scratch, Encode::FB_QUIET);
+ return $lang if ($scratch eq '');
+ }
+ $handled{$lang} = 1;
+ }
+ }
+ }
+ return 'en';
+}
+
+# TEST 1: try conversion to use the specified charset.
+# TEST 2: try conversion to use Encode::Detect.
+# TEST 3: try conversion to use Encode::Guess.
+sub normalize_charset {
+ my $str = shift;
+ my $charset = shift;
+ my $return_decoded = shift;
+
+ return wantarray ? ($str, 'ascii') : $str unless ($str);
+
+ my $decoded;
+ my $detected;
+
+ if ($charset) {
+ ($decoded, $detected) = _specified_encoding($str, $charset);
+ }
+ unless ($detected) {
+ ($decoded, $detected) = _encode_detect($str);
+ }
+ unless ($detected) {
+ ($decoded, $detected) = _encode_guess($str);
+ }
+ unless ($detected) {
+ return ($str, undef);
+ }
+ $decoded =~ s/^\x{feff}//g;
+ $decoded = Encode::encode_utf8($decoded) if $return_decoded;
+
+ # unfold hiragana, katakana and han
+ if ($detected =~ /^(?:UTF|EUC|BIG5|GB|SHIFTJIS|ISO-2022|CP969$|CP932$|CP949$|CP50220|CP50221$)/i) {
+ $decoded =~ s/($KANA_HAN_RE)\012($KANA_HAN_RE)/$1$2/og;
+ }
+ return wantarray ? ($decoded, $detected) : $decoded;
+}
+
+sub _specified_encoding {
+ my $str = shift;
+ my $encoding = shift;
+
+ my $detected;
+ my $decoded;
+
+ return (undef, undef) unless ($encoding);
+
+ # note: ISO-2022-* is not deistinguish from US-ASCII
+ return (undef, undef) if ($str =~ /\e/ and $encoding !~ /^ISO-2022/i);
+
+ # UTF-16|32 encoding without BOM cannot be trusted.
+ return (undef, undef) if ($encoding =~ /^UTF-32$/i and $str !~ /^(?:\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/);
+ return (undef, undef) if ($encoding =~ /^UTF-16$/i and $str !~ /^(?:\xFF\xFE|\xFE\xFF)/);
+
+ $encoding = _get_alias($encoding);
+ my $encoder = Encode::find_encoding($encoding);
+ if (ref($encoder)) {
+ $decoded = $encoder->decode($str,Encode::FB_QUIET);
+ $detected = $encoder->name if ($str eq '');
+ }
+ return ($decoded, $detected);
+}
+
+sub _encode_detect {
+ return undef unless HAS_ENCODE_DETECT;
+ my $str = shift;
+
+ # UTF-16|32 encoding without BOM cannot be trusted.
+ return (undef, undef) if ($str =~ /\x00\x00/ and $str !~ /^(?:\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/);
+ return (undef, undef) if ($str =~ /\x00/ and $str !~ /^(?:\xFF\xFE|\xFE\xFF)/);
+
+ my $decoded;
+ my $detected = Encode::Detect::Detector::detect($str);
+ if ($detected) {
+ $detected = _get_alias($detected);
+ my $encoder = Encode::find_encoding($detected);
+ if (ref($encoder)) {
+ $decoded = $encoder->decode($str);
+ $detected = $decoded ? $encoder->name : undef;
+ }
+ else {
+ $detected = undef;
+ }
+ }
+ return ($decoded, $detected);
+}
+
+sub _encode_guess {
+ my $str = shift;
+
+ my $detected;
+ my $decoded;
+ my $encoder;
+
+ # Step 1: Examine ISO-2022-*.
+ if ($str =~ /\e/) {
+ $Encode::Guess::NoUTFAutoGuess = 1;
+ $encoder = Encode::Guess::guess_encoding($str,
+ qw/cp50221 7bit-jis iso-2022-kr/);
+ $Encode::Guess::NoUTFAutoGuess = 0;
+ }
+
+ # Step 2: Examine US-ASCII/UTF-(8|16|32)
+ unless (ref($encoder)) {
+ $Encode::Guess::NoUTFAutoGuess = 0;
+ $encoder = Encode::Guess::guess_encoding($str);
+ }
+
+ # Step 3: Examine other encodings
+ unless (ref($encoder)) {
+ $Encode::Guess::NoUTFAutoGuess = 1;
+ eval {
+ if ($str =~ /[\x80-\xFF]{4}/) {
+ $encoder = Encode::Guess::guess_encoding($str,
+ qw/euc-cn big5-eten euc-jp cp932 euc-kr cp949/);
+ }
+ else {
+ $encoder = Encode::Guess::guess_encoding($str,
+ qw/iso-8859-1 cp1252/);
+ }
+ };
+ $Encode::Guess::NoUTFAutoGuess = 0;
+ }
+ if (ref($encoder)) {
+ $detected = $encoder->name;
+ if ($detected) {
+ $decoded = $encoder->decode($str);
+ }
+ }
+ return ($decoded, $detected);
+}
+
+sub _get_alias {
+ my $encoding = shift;
+
+ unless (HAS_ENCODE_HANEXTRA) {
+ $encoding =~ s/^gb18030$/euc-cn/i;
+ }
+ $encoding =~ s/^unicode-1-1-(.+)$/$1/i;
+ $encoding =~ s/^TIS-620$/iso-8859-11/i;
+ $encoding =~ s/x-mac-(.+)$/Mac$1/i;
+ $encoding =~ s/^Shift_JIS$/cp932/i;
+ if (HAS_ENCODE_EUCJPMS) {
+ $encoding =~ s/^iso-2022-jp$/cp50221/i;
+ $encoding =~ s/^euc-jp$/cp51932/i;
+ }
+
+ return $encoding;
+}
+
+
+1;
+