| [adaa778] | 1 | #!/usr/bin/perl -w | 
|---|
 | 2 | require 5.004; | 
|---|
 | 3 | use strict; | 
|---|
 | 4 |  | 
|---|
 | 5 | use integer; | 
|---|
 | 6 |  | 
|---|
| [d5bd3a7] | 7 | # messages >= this value are written to a header file | 
|---|
 | 8 | my $dontextract_threshold = 1000; | 
|---|
 | 9 |  | 
|---|
| [adaa778] | 10 | # Magic identifier (12 bytes) | 
|---|
| [706b033] | 11 | my $magic = "Svx\nMsg\r\n\xfe\xff\0"; | 
|---|
| [adaa778] | 12 | # Designed to be corrupted by ASCII ftp, top bit stripping (or | 
|---|
 | 13 | # being used for parity).  Contains a zero byte so more likely | 
|---|
 | 14 | # to be flagged as data (e.g. by perl's "-B" test). | 
|---|
 | 15 |  | 
|---|
 | 16 | my $major = 0; | 
|---|
 | 17 | my $minor = 8; | 
|---|
 | 18 |  | 
|---|
 | 19 | # File format (multi-byte numbers in network order (bigendian)): | 
|---|
 | 20 | # 12 bytes: Magic identifier | 
|---|
 | 21 | # 1 byte:   File format major version number (0) | 
|---|
 | 22 | # 1 byte:   File format minor version number (8) | 
|---|
 | 23 | # 2 bytes:  Number of messages (N) | 
|---|
 | 24 | # 4 bytes:  Offset from XXXX to end of file | 
|---|
 | 25 | # XXXX: | 
|---|
 | 26 | # N*: | 
|---|
 | 27 | # <message> NUL | 
|---|
 | 28 |  | 
|---|
 | 29 | my %ent = (); | 
|---|
 | 30 |  | 
|---|
| [abd7d47] | 31 | open ENT, ($ENV{srcdir}?"$ENV{srcdir}/":"")."named-entities.txt" or die $!; | 
|---|
| [adaa778] | 32 | while (<ENT>) { | 
|---|
 | 33 |    my ($e, $v) = /^(\w+),(\d+)/; | 
|---|
 | 34 |    $ent{$e} = $v; | 
|---|
 | 35 | } | 
|---|
 | 36 | close ENT; | 
|---|
 | 37 |  | 
|---|
 | 38 | my %msgs = (); | 
|---|
| [d5bd3a7] | 39 | my %dontextract = (); | 
|---|
| [adaa778] | 40 |  | 
|---|
| [2083f8b] | 41 | my %raw = (); | 
|---|
 | 42 | my $raw = ''; | 
|---|
 | 43 | my $raw_comments = ''; | 
|---|
 | 44 | my $curmsg = -1; | 
|---|
 | 45 |  | 
|---|
| [adaa778] | 46 | while (<>) { | 
|---|
| [2083f8b] | 47 |    if (/^\s*#/) { | 
|---|
 | 48 |        $raw_comments .= $_; | 
|---|
 | 49 |        next; # skip comments | 
|---|
 | 50 |    } | 
|---|
 | 51 |  | 
|---|
| [adaa778] | 52 |    # en:  0 0.81 the message | 
|---|
 | 53 |    # en-us: 0 0.81 " the message " | 
|---|
| [eb695cd] | 54 |    my ($langs, $msgno, $dummy, $msg) = /^([-\w,]+):\s*(\d+)\s+("?)(.*)\3/; | 
|---|
| [adaa778] | 55 |  | 
|---|
| [2083f8b] | 56 |    if ($msgno != $curmsg) { | 
|---|
 | 57 |        $raw{$curmsg} = $raw; | 
|---|
 | 58 |        $raw = ''; | 
|---|
 | 59 |        $curmsg = $msgno; | 
|---|
 | 60 |    }     | 
|---|
 | 61 |  | 
|---|
 | 62 |    $raw .= $raw_comments . $_; | 
|---|
 | 63 |    $raw_comments = ''; | 
|---|
 | 64 |  | 
|---|
| [eb695cd] | 65 |    unless (defined $langs) { | 
|---|
| [adaa778] | 66 |       chomp; | 
|---|
 | 67 |       print STDERR "Warning: Bad line: \"$_\"\n"; | 
|---|
 | 68 |       next; | 
|---|
 | 69 |    } | 
|---|
 | 70 |  | 
|---|
| [c0a9908] | 71 |    $langs =~ tr/-/_/; | 
|---|
 | 72 |  | 
|---|
| [adaa778] | 73 |    if ($msg =~ /[\0-\x1f\x7f-\xff]/) { | 
|---|
 | 74 |       print STDERR "Warning: literal character in message $msgno\n"; | 
|---|
 | 75 |    } | 
|---|
 | 76 |  | 
|---|
| [eb695cd] | 77 |    my $utf8 = string_to_utf8($msg); | 
|---|
 | 78 |    for (split /,/, $langs) { | 
|---|
| [d5bd3a7] | 79 |       if ($msgno >= $dontextract_threshold) { | 
|---|
 | 80 |          ${$dontextract{$_}}[$msgno - $dontextract_threshold] = $utf8; | 
|---|
 | 81 |       } else { | 
|---|
 | 82 |          ${$msgs{$_}}[$msgno] = $utf8; | 
|---|
 | 83 |       } | 
|---|
| [eb695cd] | 84 |    } | 
|---|
| [adaa778] | 85 | } | 
|---|
| [9e9d0a4] | 86 | $raw{$curmsg} = $raw; | 
|---|
| [adaa778] | 87 |  | 
|---|
 | 88 | my $lang; | 
|---|
| [c0a9908] | 89 | my @langs = sort grep ! /_\*$/, keys %msgs; | 
|---|
| [adaa778] | 90 |  | 
|---|
 | 91 | my $num_msgs = -1; | 
|---|
 | 92 | foreach $lang (@langs) { | 
|---|
 | 93 |    my $aref = $msgs{$lang}; | 
|---|
 | 94 |    $num_msgs = scalar @$aref if scalar @$aref > $num_msgs; | 
|---|
 | 95 | } | 
|---|
 | 96 |  | 
|---|
 | 97 | foreach $lang (@langs) { | 
|---|
| [c0a9908] | 98 |    my $fnm = $lang; | 
|---|
 | 99 |    $fnm =~ s/(_.*)$/\U$1/; | 
|---|
 | 100 |    open OUT, ">$fnm.msg" or die $!; | 
|---|
| [adaa778] | 101 |     | 
|---|
 | 102 |    my $aref = $msgs{$lang}; | 
|---|
 | 103 |   | 
|---|
 | 104 |    my $parentaref; | 
|---|
 | 105 |    my $mainlang = $lang; | 
|---|
| [c0a9908] | 106 |    $parentaref = $msgs{$mainlang} if $mainlang =~ s/_.*$//; | 
|---|
| [adaa778] | 107 |  | 
|---|
 | 108 |    print OUT $magic or die $!; | 
|---|
 | 109 |    print OUT pack("CCn", $major, $minor, $num_msgs) or die $!; | 
|---|
 | 110 |  | 
|---|
 | 111 |    my $buff = ''; | 
|---|
 | 112 |  | 
|---|
 | 113 |    my $n; | 
|---|
 | 114 |    for $n (0 .. $num_msgs - 1) { | 
|---|
| [79c5f17] | 115 |       my $warned = 0; | 
|---|
| [adaa778] | 116 |       my $msg = $$aref[$n]; | 
|---|
 | 117 |       if (!defined $msg) { | 
|---|
 | 118 |          $msg = $$parentaref[$n] if defined $parentaref; | 
|---|
 | 119 |          if (!defined $msg) { | 
|---|
 | 120 |             $msg = ${$msgs{'en'}}[$n]; | 
|---|
| [abd7d47] | 121 |             # don't report if we have a parent (as the omission will be | 
|---|
 | 122 |             # reported there) | 
|---|
| [2083f8b] | 123 |             if (defined $msg && $msg ne '' && !defined $parentaref) { | 
|---|
 | 124 |                print STDERR "Warning: message $n not in language $lang\n"; | 
|---|
 | 125 |                open TODO, ">>$lang.todo" or die $!; | 
|---|
 | 126 |                print TODO $raw{$n}, "\n"; | 
|---|
 | 127 |                close TODO; | 
|---|
| [79c5f17] | 128 |                $warned = 1; | 
|---|
| [adaa778] | 129 |             } else { | 
|---|
| [abd7d47] | 130 |                $msg = '' if !defined $msg; | 
|---|
| [adaa778] | 131 |             } | 
|---|
 | 132 |          } | 
|---|
 | 133 |       } | 
|---|
| [acab4b4] | 134 |       if (defined($raw{$n}) | 
|---|
 | 135 |           && $raw{$n} =~ /^\s*#\s*TRANSLATE\b[- \ta-z]*\b$lang\b/m) { | 
|---|
| [79c5f17] | 136 |          if ($warned) { | 
|---|
| [abd7d47] | 137 |             print STDERR "Warning: message $n missing and also marked for retranslation for language $lang\n"; | 
|---|
| [79c5f17] | 138 |          } else { | 
|---|
 | 139 |             print STDERR "Warning: message $n needs retranslating for language $lang\n"; | 
|---|
 | 140 |             open TODO, ">>$lang.todo" or die $!; | 
|---|
 | 141 |             print TODO $raw{$n}, "\n"; | 
|---|
 | 142 |             close TODO; | 
|---|
 | 143 |          } | 
|---|
 | 144 |       } | 
|---|
| [adaa778] | 145 |       $buff .= $msg . "\0"; | 
|---|
 | 146 |    } | 
|---|
 | 147 |     | 
|---|
 | 148 |    print OUT pack('N',length($buff)), $buff or die $!; | 
|---|
 | 149 |    close OUT or die $!; | 
|---|
 | 150 | } | 
|---|
 | 151 |  | 
|---|
| [d5bd3a7] | 152 | my $num_dontextract = -1; | 
|---|
 | 153 | foreach $lang (@langs) { | 
|---|
 | 154 |    my $aref = $dontextract{$lang}; | 
|---|
 | 155 |    if (defined(@$aref)) { | 
|---|
 | 156 |        $num_dontextract = scalar @$aref if scalar @$aref > $num_dontextract; | 
|---|
 | 157 |    } | 
|---|
 | 158 | } | 
|---|
 | 159 |  | 
|---|
 | 160 | foreach $lang (@langs) { | 
|---|
| [dc639a8] | 161 |    my $fnm = $lang; | 
|---|
 | 162 |    $fnm =~ s/(_.*)$/\U$1/; | 
|---|
 | 163 |    open OUT, ">$fnm.h" or die $!; | 
|---|
| [d5bd3a7] | 164 |    print OUT "#define N_DONTEXTRACTMSGS ", $num_dontextract, "\n"; | 
|---|
 | 165 |    print OUT "static unsigned char dontextractmsgs[] ="; | 
|---|
 | 166 |  | 
|---|
 | 167 |    for my $n (0 .. $num_dontextract - 1) { | 
|---|
 | 168 |       print OUT "\n"; | 
|---|
 | 169 |  | 
|---|
 | 170 |       my $aref = $dontextract{$lang}; | 
|---|
 | 171 |   | 
|---|
 | 172 |       my $parentaref; | 
|---|
 | 173 |       my $mainlang = $lang; | 
|---|
 | 174 |       $parentaref = $dontextract{$mainlang} if $mainlang =~ s/_.*$//; | 
|---|
 | 175 |  | 
|---|
 | 176 |       my $msg = $$aref[$n]; | 
|---|
 | 177 |       if (!defined $msg) { | 
|---|
 | 178 |          $msg = $$parentaref[$n] if defined $parentaref; | 
|---|
 | 179 |          if (!defined $msg) { | 
|---|
 | 180 |             $msg = ${$dontextract{'en'}}[$n]; | 
|---|
| [2083f8b] | 181 |             # don't report if we have a parent (as the omission will be reported there) | 
|---|
 | 182 |             if (defined $msg && $msg ne '' && !defined $parentaref) { | 
|---|
 | 183 |                print STDERR "Warning: message ", $dontextract_threshold + $n, " not in language $lang\n"; | 
|---|
 | 184 |                open TODO, ">>$lang.todo" or die $!; | 
|---|
 | 185 |                print TODO $raw{$dontextract_threshold + $n}, "\n"; | 
|---|
 | 186 |                close TODO; | 
|---|
| [d5bd3a7] | 187 |             } else { | 
|---|
 | 188 |                $msg = ''; | 
|---|
 | 189 |             } | 
|---|
 | 190 |          } | 
|---|
 | 191 |       } | 
|---|
 | 192 |       $msg =~ s/\\/\\\\/g; | 
|---|
 | 193 |       $msg =~ s/"/\\"/g; | 
|---|
 | 194 |       $msg =~ s/\t/\\t/g; | 
|---|
 | 195 |       $msg =~ s/\n/\\n/g; | 
|---|
 | 196 |       $msg =~ s/\r/\\r/g; | 
|---|
 | 197 |       if ($msg =~ /^ / || $msg =~ / $/) { | 
|---|
 | 198 |          $msg =~ s/\\"/\\\\\\"/g; | 
|---|
 | 199 |          $msg = '\\"'.$msg.'\\"'; | 
|---|
 | 200 |       } | 
|---|
 | 201 |       print OUT "   /*", $dontextract_threshold + $n, "*/ \"$msg\\0\""; | 
|---|
 | 202 |    } | 
|---|
 | 203 |    print OUT ";\n"; | 
|---|
 | 204 |    close OUT or die $!; | 
|---|
 | 205 | } | 
|---|
 | 206 |  | 
|---|
| [2083f8b] | 207 | #for (sort {$a<=>$b} keys %raw) {print "$_ = [$raw{$_}]\n";} | 
|---|
 | 208 |  | 
|---|
| [adaa778] | 209 | sub string_to_utf8 { | 
|---|
 | 210 |    my $s = shift; | 
|---|
| [7de787f] | 211 |    $s =~ s/([\x80-\xff])/char_to_utf8(ord($1))/eg; | 
|---|
| [adaa778] | 212 |    $s =~ s/\&(#\d+|#x[a-f0-9]+|[a-z0-9]+);?/decode_entity($1)/eig; | 
|---|
 | 213 |    return $s; | 
|---|
 | 214 | } | 
|---|
 | 215 |  | 
|---|
 | 216 | sub decode_entity { | 
|---|
 | 217 |    my $ent = shift; | 
|---|
 | 218 |    return char_to_utf8($1) if $ent =~ /^#(\d+)$/; | 
|---|
 | 219 |    return char_to_utf8(hex($1)) if $ent =~ /^#x([a-f0-9]+)$/; | 
|---|
 | 220 |    return char_to_utf8($ent{$ent}) if exists $ent{$ent}; | 
|---|
 | 221 |    $ent = "\&$ent;"; | 
|---|
 | 222 |    print STDERR "Warning: entity \"$ent\" unknown\n"; | 
|---|
 | 223 |    return $ent; | 
|---|
 | 224 | } | 
|---|
 | 225 |  | 
|---|
 | 226 | sub char_to_utf8 { | 
|---|
 | 227 |    my $unicode = shift; | 
|---|
 | 228 |    # ASCII is easy, and the most common case | 
|---|
 | 229 |    return chr($unicode) if $unicode < 0x80; | 
|---|
 | 230 |  | 
|---|
 | 231 |    my $result = ''; | 
|---|
 | 232 |    my $n = 0x20; | 
|---|
 | 233 |    while (1) { | 
|---|
 | 234 |       $result = chr(0x80 | ($unicode & 0x3f)) . $result; | 
|---|
 | 235 |       $unicode >>= 6; | 
|---|
 | 236 |       last if $unicode < $n; | 
|---|
 | 237 |       $n >>= 1; | 
|---|
 | 238 |    } | 
|---|
 | 239 |    $result = chr((0x100 - $n*2) | $unicode) . $result; | 
|---|
 | 240 |    return $result; | 
|---|
 | 241 | } | 
|---|
| [d5bd3a7] | 242 |  | 
|---|