diff options
| author | Manuel Traut <manut@mecka.net> | 2011-04-29 09:09:27 +0200 |
|---|---|---|
| committer | Manuel Traut <manut@mecka.net> | 2011-04-29 09:09:27 +0200 |
| commit | 5238ad5a0c4a9e1c8cd036f5de4055e39bd71297 (patch) | |
| tree | 4407c087b9fb5432b1dc11e70b52dacfa0b99feb /beagle/debian-rfs/usr/share/perl5/Text | |
| parent | 60ead65c41afba7e6aa4bbcf507a1d52f7a8fe9f (diff) | |
added debootstrap stuff
Signed-off-by: Manuel Traut <manut@mecka.net>
Diffstat (limited to 'beagle/debian-rfs/usr/share/perl5/Text')
| -rw-r--r-- | beagle/debian-rfs/usr/share/perl5/Text/WrapI18N.pm | 239 |
1 files changed, 239 insertions, 0 deletions
diff --git a/beagle/debian-rfs/usr/share/perl5/Text/WrapI18N.pm b/beagle/debian-rfs/usr/share/perl5/Text/WrapI18N.pm new file mode 100644 index 0000000..4d9d73c --- /dev/null +++ b/beagle/debian-rfs/usr/share/perl5/Text/WrapI18N.pm @@ -0,0 +1,239 @@ +package Text::WrapI18N; + +require Exporter; +use strict; +use warnings; + +our @ISA = qw(Exporter); +our @EXPORT = qw(wrap); +our @EXPORT_OK = qw($columns $separator); +our %EXPORT_TAGS = ('all' => [ @EXPORT, @EXPORT_OK ]); + +our $VERSION = '0.06'; + +use vars qw($columns $break $tabstop $separator $huge $unexpand $charmap); +use Text::CharWidth qw(mbswidth mblen); + +BEGIN { + $columns = 76; + # $break, $separator, $huge, and $unexpand are not supported yet. + $break = '\s'; + $tabstop = 8; + $separator = "\n"; + $huge = 'wrap'; + $unexpand = 1; + undef $charmap; +} + +sub wrap { + my $top1=shift; + my $top2=shift; + my $text=shift; + + $text = $top1 . $text; + + # $out already-formatted text for output including current line + # $len visible width of the current line without the current word + # $word the current word which might be sent to the next line + # $wlen visible width of the current word + # $c the current character + # $b whether to allow line-breaking after the current character + # $cont_lf true when LF (line feed) characters appear continuously + # $w visible width of the current character + + my $out = ''; + my $len = 0; + my $word = ''; + my $wlen = 0; + my $cont_lf = 0; + my ($c, $w, $b); + $text =~ s/\n+$/\n/; + while(1) { + if (length($text) == 0) { + return $out . $word; + } + ($c, $text, $w, $b) = _extract($text); + if ($c eq "\n") { + $out .= $word . $separator; + if (length($text) == 0) {return $out;} + $len = 0; + $text = $top2 . $text; + $word = '' ; $wlen = 0; + next; + } elsif ($w == -1) { + # all control characters other than LF are ignored + next; + } + + # when the current line have enough room + # for the curren character + + if ($len + $wlen + $w <= $columns) { + if ($c eq ' ' || $b) { + $out .= $word . $c; + $len += $wlen + $w; + $word = ''; $wlen = 0; + } else { + $word .= $c; $wlen += $w; + } + next; + } + + # when the current line overflows with the + # current character + + if ($c eq ' ') { + # the line ends by space + $out .= $word . $separator; + $len = 0; + $text = $top2 . $text; + $word = ''; $wlen = 0; + } elsif ($wlen + $w <= $columns - length ($top2)) { + # the current word is sent to next line + $out .= $separator; + $len = 0; + $text = $top2 . $word . $c . $text; + $word = ''; $wlen = 0; + } else { + # the current word is too long to fit a line + $out .= $word . $separator; + $len = 0; + $text = $top2 . $c . $text; + $word = ''; $wlen = 0; + } + } +} + + +# Extract one character from the beginning from the given string. +# Supports multibyte encodings such as UTF-8, EUC-JP, EUC-KR, +# GB2312, and Big5. +# +# return value: (character, rest string, width, line breakable) +# character: a character. This may consist from multiple bytes. +# rest string: given string without the extracted character. +# width: number of columns which the character occupies on screen. +# line breakable: true if the character allows line break after it. + +sub _extract { + my $string=shift; + my ($l, $c, $r, $w, $b, $u); + + if (length($string) == 0) { + return ('', '', 0, 0); + } + $l = mblen($string); + if ($l == 0 || $l == -1) { + return ('?', substr($string,1), 1, 0); + } + $c = substr($string, 0, $l); + $r = substr($string, $l); + $w = mbswidth($c); + + if (!defined($charmap)) { + $charmap = `/usr/bin/locale charmap`; + } + + if ($charmap =~ /UTF.8/i) { + # UTF-8 + if ($l == 3) { + # U+0800 - U+FFFF + $u = (ord(substr($c,0,1))&0x0f) * 0x1000 + + (ord(substr($c,1,1))&0x3f) * 0x40 + + (ord(substr($c,2,1))&0x3f); + $b = _isCJ($u); + } elsif ($l == 4) { + # U+10000 - U+10FFFF + $u = (ord(substr($c,0,1))&7) * 0x40000 + + (ord(substr($c,1,1))&0x3f) * 0x1000 + + (ord(substr($c,2,1))&0x3f) * 0x40 + + (ord(substr($c,3,1))&0x3f); + $b = _isCJ($u); + } else { + $b = 0; + } + } elsif ($charmap =~ /(^EUC)|(^GB)|(^BIG)/i) { + # East Asian legacy encodings + # (EUC-JP, EUC-KR, GB2312, Big5, Big5HKSCS, and so on) + + if (ord(substr($c,0,1)) >= 0x80) {$b = 1;} else {$b = 0;} + } else { + $b = 0; + } + return ($c, $r, $w, $b); +} + +# Returns 1 for Chinese and Japanese characters. This means that +# these characters allow line wrapping after this character even +# without whitespaces because these languages don't use whitespaces +# between words. +# +# Character must be given in UCS-4 codepoint value. + +sub _isCJ { + my $u=shift; + + if ($u >= 0x3000 && $u <= 0x312f) { + if ($u == 0x300a || $u == 0x300c || $u == 0x300e || + $u == 0x3010 || $u == 0x3014 || $u == 0x3016 || + $u == 0x3018 || $u == 0x301a) {return 0;} + return 1; + } # CJK punctuations, Hiragana, Katakana, Bopomofo + if ($u >= 0x31a0 && $u <= 0x31bf) {return 1;} # Bopomofo + if ($u >= 0x31f0 && $u <= 0x31ff) {return 1;} # Katakana extension + if ($u >= 0x3400 && $u <= 0x9fff) {return 1;} # Han Ideogram + if ($u >= 0xf900 && $u <= 0xfaff) {return 1;} # Han Ideogram + if ($u >= 0x20000 && $u <= 0x2ffff) {return 1;} # Han Ideogram + + return 0; +} + +1; +__END__ + +=head1 NAME + +Text::WrapI18N - Line wrapping module with support for multibyte, fullwidth, +and combining characters and languages without whitespaces between words + +=head1 SYNOPSIS + + use Text::WrapI18N qw(wrap $columns); + wrap(firstheader, nextheader, texts); + +=head1 DESCRIPTION + +This module intends to be a better Text::Wrap module. +This module is needed to support multibyte character encodings such +as UTF-8, EUC-JP, EUC-KR, GB2312, and Big5. This module also supports +characters with irregular widths, such as combining characters (which +occupy zero columns on terminal, like diacritical marks in UTF-8) and +fullwidth characters (which occupy two columns on terminal, like most +of east Asian characters). Also, minimal handling of languages which +doesn't use whitespaces between words (like Chinese and Japanese) is +supported. + +Like Text::Wrap, hyphenation and "kinsoku" processing are not supported, +to keep simplicity. + +I<wrap(firstheader, nextheader, texts)> is the main subroutine of +Text::WrapI18N module to execute the line wrapping. Input parameters +and output data emulate Text::Wrap. The texts have to be written in +locale encoding. + +=head1 SEE ALSO + +locale(5), utf-8(7), charsets(7) + +=head1 AUTHOR + +Tomohiro KUBOTA, E<lt>kubota@debian.orgE<gt> + +=head1 COPYRIGHT AND LICENSE + +Copyright 2003 by Tomohiro KUBOTA + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut |
