--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/charconvfw/Charconv/ongoing/Group/CHARCONV.PL Tue Feb 02 02:02:46 2010 +0200
@@ -0,0 +1,536 @@
+#
+# Copyright (c) 2000 Nokia Corporation and/or its subsidiary(-ies).
+# All rights reserved.
+# This component and the accompanying materials are made available
+# under the terms of "Eclipse Public License v1.0"
+# which accompanies this distribution, and is available
+# at the URL "http://www.eclipse.org/legal/epl-v10.html".
+#
+# Initial Contributors:
+# Nokia Corporation - initial contribution.
+#
+# Contributors:
+#
+# Description:
+#
+
+use strict;
+use integer;
+
+sub PerlScriptPath
+ {
+ my $perlScriptPath=$0;
+ $perlScriptPath=~s/\//\\/g; # replace any forward-slashes with back-slashes
+ $perlScriptPath=~s/(\\?)[^\\]+$/$1/; # get rid of this Perl-script's file-name
+ return $perlScriptPath;
+ }
+BEGIN
+ {
+ unshift(@INC, &PerlScriptPath()); # can't do "use lib &PerlScriptPath()" here as "use lib" only seems to work with *hard-coded* directory names
+ }
+use PARSER;
+use UTF;
+
+# The following numbers are used for byte-orders:
+# 0 means unspecified
+# 1 means big-endian
+# 2 means little-endian
+
+FixParametersToWorkWithWindows98(\@ARGV);
+my $versionNumber = 3;
+my $outputByteOrderMark = 0;
+my $unicodeByteOrder = 0;
+my $inputEncoding = "";
+my $outputEncoding = "";
+my %foreignCharacters = (); # Hash with the foreign Character code as the value, unicode as key
+my %unicodeCharacters = (); # Hash with the Unicode Character code as the value, foreign as key
+
+
+my $inputFile=\*STDIN;
+my $outputFile=\*STDOUT;
+ReadParameters(\@ARGV,\$outputByteOrderMark,\$unicodeByteOrder,\$inputEncoding,\$outputEncoding,\$inputFile,\$outputFile);
+HandleByteOrderMarks($outputByteOrderMark,\$unicodeByteOrder, \$inputEncoding,\$outputEncoding, $inputFile, $outputFile);
+DoConversion(\$unicodeByteOrder, \$inputEncoding, \$outputEncoding, $inputFile, $outputFile, \%foreignCharacters, \%unicodeCharacters);
+if ($inputFile!=\*STDIN)
+ {
+ close($inputFile) or die;
+ }
+if ($outputFile!=\*STDOUT)
+ {
+ close($outputFile) or die;
+ }
+
+sub FixParametersToWorkWithWindows98
+ {
+ my $parameters=shift;
+ my $i;
+ for ($i=@$parameters-2; $i>=0; --$i) # iterate backwards as some parameters may be deleted from @$parameters
+ {
+ if (($parameters->[$i]=~/^(-input)$/i) ||
+ ($parameters->[$i]=~/^(-output)$/i))
+ {
+ $parameters->[$i].='='.$parameters->[$i+1];
+ splice(@$parameters, $i+1, 1);
+ }
+ }
+ }
+
+sub PrintUsage
+ {
+ print "\nVersion $versionNumber\n\nCharacter set conversion tool\nCopyright (c) 1999 Symbian Ltd\n\n";
+ print "Usage:\n\n\t charconv [<options>] <inputspec> <outputspec>\n\nwhere\n\n\t";
+ print "options := [-big|-little][-byteordermark]\n\t";
+ print "inputspec := -input=<format> [<input_file>]\n\t";
+ print "outputspec := -output=<format> [<output_file>]\n\t";
+ print "format := unicode|utf8|big5|gb2312...\n\n";
+ }
+
+sub Assert
+ {
+ my $condition = shift;
+ my $errorMessage = shift;
+ if (!($condition)) # find out where this is used and work this out
+ {
+ die("Error: $errorMessage");
+ }
+ }
+
+sub PrintWarning
+ {
+ my $warningMessage = shift;
+ print STDERR "Warning: $warningMessage\n";
+ }
+
+
+sub TryFileParameter
+ {
+ my $args = shift;
+ my $argindex = shift;
+ my $inputoroutput = shift;
+ my $encoding = shift;
+ my $filehandle = shift;
+ my $prefix = "-$inputoroutput=";
+
+ if ($args->[$$argindex] =~ /^$prefix(.*)/)
+ {
+ Assert($$encoding eq "", "\"$prefix...\" is specified more than once");
+ $$encoding = $1;
+ ++$$argindex;
+ if (($$argindex >= @$args) || ($args->[$$argindex] =~ /^-/))
+ {
+ --$$argindex;
+ }
+ else
+ {
+ if ($inputoroutput =~ /input/i)
+ {
+ open(INPUT_FILE,"<$args->[$$argindex]") or die "opening $inputoroutput-file failed $!";
+ $$filehandle=\*INPUT_FILE;
+ }
+ else
+ {
+ open(OUTPUT_FILE,">$args->[$$argindex]") or die "opening $inputoroutput-file failed $!";
+ $$filehandle=\*OUTPUT_FILE;
+ }
+ }
+ binmode $$filehandle;
+ return 1;
+ }
+ return 0;
+ }
+
+sub ReadParameters
+ {
+ my $args = shift;
+ my $outputbyteordermark = shift;
+ my $unicodebyteorder = shift;
+ my $inputencoding = shift;
+ my $outputencoding = shift;
+ my $inputhandle = shift;
+ my $outputhandle = shift;
+ my $i;
+ my $range;
+ if ((@$args <= 0) || ($args->[0] eq "?") || ($args->[0] eq "/?"))
+ {
+ PrintUsage();
+ exit;
+ }
+
+ for ($i = 0; $i < @$args ; ++$i)
+ {
+ if ( $args->[$i]=~ /-byteordermark/i)
+ {
+ Assert(!$$outputbyteordermark, "\"-byteordermark\" is specified more than once");
+ $$outputbyteordermark = 1;
+ }
+ elsif ($args->[$i]=~ /-big/i)
+ {
+ Assert(($$unicodebyteorder==0),"the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
+ $$unicodebyteorder = 1;
+ }
+ elsif ($args->[$i]=~ /-little/i)
+ {
+ Assert(($$unicodebyteorder==0),"the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
+ $$unicodebyteorder = 2;
+ }
+ else
+ {
+ Assert(TryFileParameter($args, \$i, "input",$inputencoding,$inputhandle) ||
+ TryFileParameter($args, \$i, "output",$outputencoding, $outputhandle), "bad parameter \"$args->[$i]\"");
+ }
+ }
+ Assert($$inputencoding ne "", "no input encoding is specified");
+ Assert($$outputencoding ne "", "no output encoding is specified");
+ }
+
+sub ReadFromFile
+ {
+ my $buffer = shift;
+ my $numOfBytesToRead = shift;
+ my $inputhandle = shift;
+ my $numOfBytesRead = 0;
+ my $numOfBytesToReadThisTime = $numOfBytesToRead;
+
+ for(;;)
+ {
+ for(;;)
+ {
+ my $remainingNumOfBytesToRead = $numOfBytesToRead - $numOfBytesRead;
+ if ($numOfBytesToReadThisTime > $remainingNumOfBytesToRead)
+ {
+ $numOfBytesToReadThisTime = $remainingNumOfBytesToRead;
+ }
+ my $numOfBytesReadThisTime = read $inputhandle, $$buffer, $numOfBytesToReadThisTime;
+ if (defined $numOfBytesReadThisTime)
+ {
+ $numOfBytesRead += $numOfBytesReadThisTime;
+ Assert($numOfBytesRead <= $numOfBytesReadThisTime, "internal error (read too many bytes)");
+ if (($numOfBytesRead >= $numOfBytesReadThisTime) || $numOfBytesReadThisTime == 0)
+ {
+ return;
+ }
+ last;
+ }
+ $numOfBytesToReadThisTime /= 2;
+ Assert($numOfBytesToReadThisTime >0, "reading from file failed");
+ }
+ }
+ }
+
+sub HandleByteOrderMarks
+ {
+ my $outputbyteordermark = shift;
+ my $unicodebyteorder = shift;
+ my $inputencoding = shift;
+ my $outputencoding = shift;
+ my $inputhandle = shift;
+ my $outputhandle = shift;
+
+ if ($$inputencoding =~ /unicode/i)
+ {
+ my $firstUnicodeCharacter = 0;
+ ReadFromFile(\$firstUnicodeCharacter, 2, $inputhandle);
+ my $byteOrderSpecifiedByByteOrderMark = 0;
+ if (length($firstUnicodeCharacter) == 2)
+ {
+ my @firstUnicodeCharacter = unpack "C*", $firstUnicodeCharacter;
+ if (($firstUnicodeCharacter[0]==0xff) && ($firstUnicodeCharacter[1]==0xfe))
+ {
+ $byteOrderSpecifiedByByteOrderMark = 2;
+ }
+ elsif (($firstUnicodeCharacter[0]==0xfe) && ($firstUnicodeCharacter[1]==0xff))
+ {
+ $byteOrderSpecifiedByByteOrderMark = 1;
+ }
+ else
+ {
+ my $error = seek $inputhandle, 0, 0; # rewind to start of file
+ Assert ($error == 1, "could not rewind to the start of input file");
+ }
+ }
+ if ($byteOrderSpecifiedByByteOrderMark!=0)
+ {
+ if (($$unicodebyteorder!=0) && ($byteOrderSpecifiedByByteOrderMark!=$$unicodebyteorder))
+ {
+ PrintWarning ("the byte order specified by the byte-order mark in the unicode input is different from the byte order specified by the parameter - taking the byte-order specified by the byte-order mark in the unicode input");
+ }
+ $$unicodebyteorder = $byteOrderSpecifiedByByteOrderMark;
+ }
+ }
+ if ($outputbyteordermark)
+ {
+ if ($$outputencoding ne "unicode")
+ {
+ PrintWarning("\"-byteordermark\" is only relevant for unicode output");
+ }
+ else
+ {
+ Assert($$unicodebyteorder!=0, "the byte order must be specified if a byte-order mark is to be added to the unicode output");
+ my $firstUnicodeCharacter=($$unicodebyteorder==1)? "\xfe\xff": "\xff\xfe";
+ WriteToFile(\$firstUnicodeCharacter, $outputhandle);
+ }
+ }
+ }
+
+sub WriteToFile
+ {
+ my $buffer = shift;
+ my $outputhandle = shift;
+
+ print $outputhandle $$buffer;
+ }
+
+sub DoConversion
+ {
+ my $unicodebyteorder = shift;
+ my $inputencoding = shift;
+ my $outputencoding = shift;
+ my $inputhandle = shift;
+ my $outputhandle = shift;
+ my $foreignCharacters = shift;
+ my $unicodeCharacters = shift;
+
+ my $currentBuffer = 0;
+ my @arrayOfBuffers = ('', '', '');
+ my $largeNumber=1000000;
+ ReadFromFile(\($arrayOfBuffers[$currentBuffer]), $largeNumber, $inputhandle);
+ ReverseByteOrderIfUnicodeAndBigEndian($unicodebyteorder, $inputencoding, \($arrayOfBuffers[$currentBuffer]));
+ if ($$inputencoding ne $$outputencoding)
+ {
+ if ($$inputencoding !~ /^unicode$/i)
+ {
+ my $nextBuffer = $currentBuffer + 1;
+ OtherToUnicode ($inputencoding, \($arrayOfBuffers[$nextBuffer]), ($arrayOfBuffers[$currentBuffer]), $foreignCharacters, $unicodeCharacters, 'v');
+ $currentBuffer = $nextBuffer;
+ }
+ if ($$outputencoding !~ /^unicode$/i)
+ {
+ my $nextBuffer = $currentBuffer + 1;
+ UnicodeToOther($outputencoding, \($arrayOfBuffers[$nextBuffer]), ($arrayOfBuffers[$currentBuffer]), $foreignCharacters, $unicodeCharacters, 'v');
+ $currentBuffer = $nextBuffer;
+ }
+ }
+ ReverseByteOrderIfUnicodeAndBigEndian($unicodebyteorder, $outputencoding, \($arrayOfBuffers[$currentBuffer]));
+ WriteToFile(\($arrayOfBuffers[$currentBuffer]), $outputhandle);
+ }
+
+sub ReverseByteOrderIfUnicodeAndBigEndian
+ {
+ my $unicodebyteorder = shift;
+ my $encoding = shift;
+ my $buffer = shift;
+ my $i;
+
+ if ($$encoding =~ /^unicode$/i)
+ {
+ Assert(length($$buffer)%2==0, "internal error (bad number of bytes in unicode buffer)");
+ if ($$unicodebyteorder==0)
+ {
+ PrintWarning("the byte order of unicode text is unspecified - defaulting to little-endian");
+ $$unicodebyteorder = 2;
+ }
+ if ($$unicodebyteorder==1)
+ {
+ $$buffer=pack('v*', unpack('n*', $$buffer));
+ }
+ }
+ }
+
+sub FillInHashes
+ {
+ my $foreignCharacters = shift;
+ my $unicodeCharacters = shift;
+ my $encoding = shift;
+ my $replacementCharacter = shift;
+ my $ranges = shift;
+ my $bigEndian = shift;
+
+ my $endianness = 0;
+ my $replacenum = 0;
+ my $rangenum = 0;
+ my $fileread = 0;
+ my $largenumber = 1000000;
+
+ my $dataFile=&PerlScriptPath()."charconv\\".$$encoding.'.dat';
+
+ my $line;
+
+ if (-e $dataFile)
+ {
+ open (HASH_INPUT, "< $dataFile") or die ("Could not open file for reading");
+
+ binmode HASH_INPUT;
+ # reading the endianness
+ $fileread = read HASH_INPUT, $endianness, 1;
+ $endianness = unpack "C",$endianness;
+ if ($endianness == 0)
+ {
+ # set the template to a default-> n for the eman time
+ $$bigEndian = 0;
+ }
+ elsif ($endianness == 1)
+ {
+ $$bigEndian = 0;
+ }
+ elsif ($endianness == 2)
+ {
+ $$bigEndian = 1;
+ }
+ else
+ {
+ print "Illegal Endianness specified in the control files";
+ }
+ #reading the replacement characters
+ $fileread = read HASH_INPUT, $replacenum,1;
+ $replacenum= unpack "C",$replacenum;
+ $fileread = read HASH_INPUT, $$replacementCharacter,$replacenum;
+ # reading the ranges
+ $fileread = read HASH_INPUT, $rangenum, 1;
+ $rangenum = unpack "C",$rangenum;
+ my $i; # loop variable
+ for ($i=0; $i < $rangenum; ++$i)
+ {
+ my $lowerrange = 0;
+ my $upperrange = 0;
+ my $followchar = 0;
+
+ $fileread = read HASH_INPUT,$lowerrange,1;
+ $lowerrange = unpack "C",$lowerrange;
+ $fileread = read HASH_INPUT,$upperrange,1;
+ $upperrange = unpack "C",$upperrange;
+ $fileread = read HASH_INPUT,$followchar,1;
+ $followchar = unpack "C",$followchar;
+
+ push @$ranges,[$lowerrange,$upperrange,$followchar];
+ }
+ my $data = 0;
+ my @unpackeddata = 0;
+ $fileread = read HASH_INPUT, $data, $largenumber;
+ @unpackeddata = unpack "v*",$data;
+ for($i = 0; $i <= $#unpackeddata; $i= $i+2)
+ {
+ $unicodeCharacters->{$unpackeddata[$i]}=$unpackeddata[$i+1];
+ $foreignCharacters->{$unpackeddata[$i+1]}=$unpackeddata[$i];
+ }
+ }
+ else
+ {
+ die ("Encoding Format \"$$encoding\" not recognised");
+ }
+ }
+
+sub OtherToUnicode
+ {
+ my $inputencoding = shift;
+ my $unicode = shift;
+ my $other = shift;
+ my $foreignCharacters = shift;
+ my $unicodeCharacters = shift;
+ my $unicodetemplate = shift;
+ my $replacementCharacter = 0;
+ my $unicodeReplacementCharacter = pack($unicodetemplate, 0xfffd);
+ my @ranges=();
+
+ my $otherIndex= 0;
+ my $numOfBytes = length($other);
+ my $key = 0;
+ my $inRange = 0;
+ my $followByte = -1;
+
+ if ($$inputencoding=~/^utf8$/i)
+ {
+ return &Utf8ToUnicode($unicode, $other, $unicodetemplate);
+ }
+ my $bigEndian;
+ FillInHashes($foreignCharacters,$unicodeCharacters, $inputencoding, \$replacementCharacter,\@ranges,\$bigEndian);
+ for (;;)
+ {
+ if ($otherIndex > $numOfBytes -1)
+ {
+ last;
+ }
+ my $frontByte = (unpack("x$otherIndex".'C', $other))[0];
+ # @ranges is an array of references. Each reference is a reference to an array
+ for ($key = 0; $key <= $#ranges; ++$key)
+ {
+ my $arrayref = $ranges[$key];
+ if (($frontByte >= $arrayref->[0]) && ($frontByte <= $arrayref->[1]))
+ {
+ $followByte = $arrayref->[2];
+ $inRange = 1;
+ }
+ }
+ Assert ($inRange != 0, "cannot figure out the Byte size of the character");
+ my $tempByte = 0;
+ for ($key = 0; $key<= $followByte; ++$key)
+ {
+ if ($bigEndian)
+ {
+ $tempByte = ($tempByte << 8) | (unpack("x$otherIndex".'C', $other))[0];
+ }
+ else
+ {
+ $tempByte = $tempByte | ((unpack("x$otherIndex".'C', $other))[0] << (8*$key));
+ }
+ $otherIndex++;
+ }
+ if (exists $unicodeCharacters->{$tempByte})
+ {
+ $$unicode .= pack $unicodetemplate , $unicodeCharacters->{$tempByte};
+ }
+ else
+ {
+ $$unicode .= $unicodeReplacementCharacter;
+ }
+ }
+ }
+
+sub UnicodeToOther
+ {
+ my $outputencoding = shift;
+ my $other = shift;
+ my $unicode = shift;
+ my $foreignCharacters = shift;
+ my $unicodeCharacters = shift;
+ my $unicodetemplate = shift;
+ my $replacementCharacter = 0;
+ my @ranges=();
+
+ my $unicodeIndex= 0;
+ my $numOfBytes = length($unicode);
+ my @UnicodeUnpacked = ();
+ my $key = 0;
+
+ if ($$outputencoding=~/^utf8$/i)
+ {
+ return &UnicodeToUtf8($other, $unicode, $unicodetemplate);
+ }
+ my $bigEndian;
+ FillInHashes($foreignCharacters,$unicodeCharacters, $outputencoding, \$replacementCharacter,\@ranges,\$bigEndian);
+ my $foreignTemplate=$bigEndian? 'n': 'v';
+ @UnicodeUnpacked = unpack "$unicodetemplate*", $unicode;
+ foreach $key (@UnicodeUnpacked)
+ {
+ if (!exists($foreignCharacters->{$key}))
+ {
+ $$other .= $replacementCharacter;
+ }
+ else
+ {
+ # This is the WRONG but it will work for the mean time
+ # This will fail if the foreignCharacter has characters that are more than
+ # two bytes long ..... But this should work for foreign characters of 1 or 2 Bytes
+
+ my $foreignValue = $foreignCharacters->{$key};
+ if ( $foreignValue <= 255)
+ {
+ $$other .= pack "C" , $foreignValue;
+ }
+ else
+ {
+ $$other .= pack $foreignTemplate, $foreignValue;
+ }
+ }
+ }
+ }
+