charconvfw/Charconv/ongoing/Group/CHARCONV.PL
changeset 0 1fb32624e06b
child 16 56cd22a7a1cb
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/charconvfw/Charconv/ongoing/Group/CHARCONV.PL	Tue Feb 02 02:02:46 2010 +0200
@@ -0,0 +1,536 @@
+#
+# Copyright (c) 2000 Nokia Corporation and/or its subsidiary(-ies).
+# All rights reserved.
+# This component and the accompanying materials are made available
+# under the terms of "Eclipse Public License v1.0"
+# which accompanies this distribution, and is available
+# at the URL "http://www.eclipse.org/legal/epl-v10.html".
+#
+# Initial Contributors:
+# Nokia Corporation - initial contribution.
+#
+# Contributors:
+#
+# Description:      
+#
+
+use strict;
+use integer;
+
+# Returns the directory part of this script's path (taken from $0), written
+# with back-slashes and keeping the trailing back-slash. Returns an empty
+# string when $0 has no directory part.
+sub PerlScriptPath
+	{
+	(my $scriptDirectory=$0)=~tr{/}{\\}; # turn every forward-slash into a back-slash
+	$scriptDirectory=~s/(\\?)[^\\]+$/$1/; # strip the script's own file-name, keeping the separator in front of it
+	return $scriptDirectory;
+	}
+# Runs at compile time, i.e. before the "use" statements that follow are
+# processed, and puts this script's own directory at the front of the module
+# search path (presumably so that PARSER and UTF are picked up from alongside
+# the script - confirm where those modules are installed).
+BEGIN
+	{
+	unshift(@INC, &PerlScriptPath()); # can't do "use lib &PerlScriptPath()" here as "use lib" only seems to work with *hard-coded* directory names
+	}
+use PARSER;
+use UTF;
+
+# The following numbers are used for byte-orders:
+#     0 means unspecified
+#     1 means big-endian
+#     2 means little-endian
+
+# Main program: normalise the command line, parse it, deal with byte-order
+# marks, convert the input to the output encoding and close any files opened.
+
+# Join separate "-input <format>" / "-output <format>" arguments into the
+# "-input=<format>" / "-output=<format>" form that ReadParameters parses
+FixParametersToWorkWithWindows98(\@ARGV);
+my $versionNumber = 3; # printed by PrintUsage
+my $outputByteOrderMark = 0; # set to 1 by ReadParameters when "-byteordermark" is given
+my $unicodeByteOrder = 0; # one of the byte-order numbers listed above
+my $inputEncoding = ""; 
+my $outputEncoding = ""; 
+my %foreignCharacters = (); # Hash with the foreign Character code as the value, unicode as key 
+my %unicodeCharacters = (); # Hash with the Unicode Character code as the value, foreign as key 
+
+
+# Default to the standard streams; ReadParameters replaces these with file
+# handles when file names are given on the command line
+my $inputFile=\*STDIN;
+my $outputFile=\*STDOUT;
+ReadParameters(\@ARGV,\$outputByteOrderMark,\$unicodeByteOrder,\$inputEncoding,\$outputEncoding,\$inputFile,\$outputFile);
+HandleByteOrderMarks($outputByteOrderMark,\$unicodeByteOrder, \$inputEncoding,\$outputEncoding, $inputFile, $outputFile);
+DoConversion(\$unicodeByteOrder, \$inputEncoding, \$outputEncoding, $inputFile, $outputFile, \%foreignCharacters, \%unicodeCharacters);
+# The comparisons below are numeric comparisons of references, so they test
+# whether the handle is still the standard stream it was initialised to
+if ($inputFile!=\*STDIN)
+	{
+	close($inputFile) or die;
+	}
+if ($outputFile!=\*STDOUT)
+	{
+	close($outputFile) or die;
+	}
+
+# Rewrites the argument list in place so that a bare "-input" or "-output"
+# (any letter case) followed by another argument becomes the single argument
+# "-input=<next>" / "-output=<next>".
+#
+# Parameters:
+#     $parameters - reference to the argument array to be modified
+sub FixParametersToWorkWithWindows98
+	{
+	my ($parameters)=@_;
+	# walk from the last-but-one argument down to the first, so that removing
+	# the argument after the current one never disturbs positions still to be visited
+	my $index=scalar(@$parameters)-2;
+	while ($index>=0)
+		{
+		if ($parameters->[$index]=~/^-(?:input|output)$/i)
+			{
+			my ($value)=splice(@$parameters, $index+1, 1);
+			$parameters->[$index].="=$value";
+			}
+		--$index;
+		}
+	}
+
+# Prints the version number (from the file-level $versionNumber) and the
+# command-line usage summary to the currently selected output handle.
+sub PrintUsage 
+	{
+	my @usageText=
+		(
+		"\nVersion $versionNumber\n\nCharacter set conversion tool\nCopyright (c) 1999 Symbian Ltd\n\n",
+		"Usage:\n\n\t charconv [<options>] <inputspec> <outputspec>\n\nwhere\n\n\t",
+		"options    :=  [-big|-little][-byteordermark]\n\t",
+		"inputspec  :=  -input=<format> [<input_file>]\n\t",
+		"outputspec :=  -output=<format> [<output_file>]\n\t",
+		"format     :=  unicode|utf8|big5|gb2312...\n\n",
+		);
+	foreach my $chunk (@usageText)
+		{
+		print $chunk;
+		}
+	}
+
+# Dies with "Error: <message>" unless the given condition is true.
+#
+# Parameters:
+#     $condition    - value tested for truth
+#     $errorMessage - text appended to "Error: " when the condition is false
+sub Assert
+	{
+	my ($condition, $errorMessage) = @_;
+	return if ($condition); # nothing to report
+	die("Error: $errorMessage");
+	}
+
+# Writes "Warning: <message>" followed by a new-line to STDERR.
+#
+# Parameters:
+#     $warningMessage - the text of the warning
+sub PrintWarning 
+	{
+	my ($warningMessage) = @_;
+	print STDERR 'Warning: '.$warningMessage."\n";
+	}
+	
+
+# Tries to interpret the argument at position $$argindex as
+# "-input=<format>" or "-output=<format>", optionally followed by a file name
+# in the next argument.
+#
+# Parameters:
+#     $args          - reference to the argument array
+#     $argindex      - reference to the index of the argument to examine; it
+#                      is advanced by one when a file name is consumed
+#     $inputoroutput - "input" or "output", i.e. which switch to look for
+#     $encoding      - reference to the scalar that receives <format>
+#     $filehandle    - reference to the scalar holding the file handle; it is
+#                      replaced when a file name is given, and the handle it
+#                      ends up holding is put into binary mode
+#
+# Returns 1 if the argument was recognised, otherwise 0 (nothing is modified
+# in that case). Dies if the switch has already been seen or if the file
+# cannot be opened.
+sub TryFileParameter 
+	{
+	my $args = shift;
+	my $argindex = shift;
+	my $inputoroutput = shift;
+	my $encoding = shift;
+	my $filehandle = shift;
+	my $prefix = "-$inputoroutput=";
+
+	# case-insensitive, like the matching of the other switches and of "-input"/"-output" in FixParametersToWorkWithWindows98
+	if ($args->[$$argindex] =~ /^\Q$prefix\E(.*)/i)
+		{
+		my $format = $1; # saved straight away, before any other code gets a chance to run a pattern match
+		Assert($$encoding eq "", "\"$prefix...\" is specified more than once");
+		$$encoding = $format;
+		my $nextIndex = $$argindex + 1;
+		# the following argument is a file name unless it is missing or is another switch
+		if (($nextIndex < @$args) && ($args->[$nextIndex] !~ /^-/))
+			{
+			my $fileName = $args->[$nextIndex];
+			my $mode = ($inputoroutput =~ /input/i)? '<': '>';
+			# three-argument open: the file name is taken literally, so characters such as
+			# ">", "|" or surrounding white-space in it cannot alter the open mode
+			open(my $handle, $mode, $fileName) or die "opening $inputoroutput-file failed $!";
+			$$filehandle = $handle;
+			$$argindex = $nextIndex;
+			}
+		binmode $$filehandle;
+		return  1;
+		}
+	return 0; 
+	}
+
+# Parses the command-line arguments.
+#
+# Parameters:
+#     $args                - reference to the argument array
+#     $outputbyteordermark - reference; set to 1 if "-byteordermark" is given
+#     $unicodebyteorder    - reference; set to 1 for "-big", 2 for "-little"
+#     $inputencoding       - reference; receives the format from "-input=<format>"
+#     $outputencoding      - reference; receives the format from "-output=<format>"
+#     $inputhandle         - reference; replaced if an input file is named
+#     $outputhandle        - reference; replaced if an output file is named
+#
+# Prints the usage text and exits if there are no arguments or the first one
+# is "?" or "/?". Dies on an unknown or repeated argument, or if either
+# encoding is missing.
+sub ReadParameters 
+	{
+	my $args = shift; 
+	my $outputbyteordermark = shift;
+	my $unicodebyteorder = shift;
+	my $inputencoding = shift;
+	my $outputencoding = shift;
+	my $inputhandle = shift;
+	my $outputhandle = shift;
+	my $i;
+	if ((@$args <= 0) || ($args->[0] eq "?") || ($args->[0] eq "/?"))
+		{
+		PrintUsage();
+		exit;
+		}
+
+	for ($i = 0; $i < @$args ; ++$i)
+		{
+		# the switch patterns are anchored so that an argument which merely *contains*
+		# one of these words (e.g. "-input=x-big5") is not mistaken for the switch
+		if ( $args->[$i]=~ /^-byteordermark$/i)
+			{
+			Assert(!$$outputbyteordermark, "\"-byteordermark\" is specified more than once");
+			$$outputbyteordermark = 1; 
+			}
+		elsif ($args->[$i]=~ /^-big$/i)
+			{
+			Assert(($$unicodebyteorder==0),"the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
+			$$unicodebyteorder = 1;
+			}
+		elsif ($args->[$i]=~ /^-little$/i)
+			{
+			Assert(($$unicodebyteorder==0),"the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
+			$$unicodebyteorder = 2;
+			}
+		else
+			{ 
+			# TryFileParameter advances $i itself when it also consumes a file name
+			Assert(TryFileParameter($args, \$i, "input",$inputencoding,$inputhandle) ||
+				   TryFileParameter($args, \$i, "output",$outputencoding, $outputhandle), "bad parameter \"$args->[$i]\"");
+			}
+		}
+	Assert($$inputencoding ne "", "no input encoding is specified");
+	Assert($$outputencoding ne "", "no output encoding is specified");
+	}
+
+# Reads up to $numOfBytesToRead bytes from $inputhandle into $$buffer,
+# replacing whatever $$buffer held before.
+#
+# Parameters:
+#     $buffer           - reference to the scalar that receives the bytes
+#     $numOfBytesToRead - the maximum number of bytes to read
+#     $inputhandle      - the file handle to read from
+#
+# Reading stops when the requested number of bytes has arrived or the end of
+# the input is reached, so $$buffer may end up shorter than requested (or
+# empty). If a read fails, it is retried with half the request size; dies
+# once the request size has shrunk to nothing.
+sub ReadFromFile
+	{  
+	my $buffer = shift;
+	my $numOfBytesToRead = shift;
+	my $inputhandle = shift;
+	my $numOfBytesRead = 0;									
+	my $numOfBytesToReadThisTime = $numOfBytesToRead;		
+
+	$$buffer = '';
+	while ($numOfBytesRead < $numOfBytesToRead)
+		{
+		my $remainingNumOfBytesToRead = $numOfBytesToRead - $numOfBytesRead;  
+		if ($numOfBytesToReadThisTime > $remainingNumOfBytesToRead)
+			{
+			$numOfBytesToReadThisTime = $remainingNumOfBytesToRead;
+			}  		
+		# the final argument is the offset in $$buffer at which the new bytes are placed,
+		# so that each read appends to what has already arrived instead of overwriting it
+		my $numOfBytesReadThisTime = read $inputhandle, $$buffer, $numOfBytesToReadThisTime, $numOfBytesRead;  
+		if (!defined $numOfBytesReadThisTime)
+			{
+			# the read failed - try again with a smaller request
+			$numOfBytesToReadThisTime = int($numOfBytesToReadThisTime / 2);
+			Assert($numOfBytesToReadThisTime >0, "reading from file failed");
+			next;
+			}
+		if ($numOfBytesReadThisTime == 0)
+			{
+			return; # end of input
+			}
+		$numOfBytesRead += $numOfBytesReadThisTime;
+		Assert($numOfBytesRead <= $numOfBytesToRead, "internal error (read too many bytes)");
+		}
+	}
+
+# Consumes a byte-order mark at the start of unicode input and, if asked to,
+# writes one at the start of unicode output.
+#
+# Parameters:
+#     $outputbyteordermark - true if a byte-order mark is to be written
+#     $unicodebyteorder    - reference to the byte order (0 unspecified,
+#                            1 big-endian, 2 little-endian); overwritten with
+#                            the order given by a byte-order mark in the input
+#     $inputencoding       - reference to the input encoding name
+#     $outputencoding      - reference to the output encoding name
+#     $inputhandle         - the input file handle
+#     $outputhandle        - the output file handle
+#
+# If the unicode input does not start with a byte-order mark, the input is
+# rewound to its start (dies if that is not possible). Dies if a byte-order
+# mark is to be written but the byte order is unspecified.
+sub HandleByteOrderMarks 
+	{
+	my $outputbyteordermark = shift;
+	my $unicodebyteorder = shift;
+	my $inputencoding = shift;
+	my $outputencoding = shift;
+	my $inputhandle = shift;
+	my $outputhandle = shift;
+
+	# "unicode" is matched as a whole word and without regard to case, in the same
+	# way as DoConversion and ReverseByteOrderIfUnicodeAndBigEndian match it
+	if ($$inputencoding =~ /^unicode$/i) 
+		{
+		my $firstUnicodeCharacter = 0; 
+		ReadFromFile(\$firstUnicodeCharacter, 2, $inputhandle);
+		my $byteOrderSpecifiedByByteOrderMark = 0;
+		if (length($firstUnicodeCharacter) == 2)
+			{
+			my @firstUnicodeCharacter = unpack "C*", $firstUnicodeCharacter;
+			if (($firstUnicodeCharacter[0]==0xff) && ($firstUnicodeCharacter[1]==0xfe))
+				{
+				$byteOrderSpecifiedByByteOrderMark = 2; 
+				}
+			elsif (($firstUnicodeCharacter[0]==0xfe) && ($firstUnicodeCharacter[1]==0xff))
+				{
+				$byteOrderSpecifiedByByteOrderMark = 1; 
+				}
+			else
+				{
+				# not a byte-order mark, so the two bytes just read are ordinary text
+				my $error = seek $inputhandle, 0, 0; # rewind to start of file
+				Assert ($error == 1, "could not rewind to the start of input file");
+				}
+			}
+		if ($byteOrderSpecifiedByByteOrderMark!=0)
+			{
+			if (($$unicodebyteorder!=0) && ($byteOrderSpecifiedByByteOrderMark!=$$unicodebyteorder))
+				{
+				PrintWarning ("the byte order specified by the byte-order mark in the unicode input is different from the byte order specified by the parameter - taking the byte-order specified by the byte-order mark in the unicode input");
+				}
+			$$unicodebyteorder = $byteOrderSpecifiedByByteOrderMark;
+			}
+		}
+	if ($outputbyteordermark)
+		{
+		if ($$outputencoding !~ /^unicode$/i) 
+			{
+			PrintWarning("\"-byteordermark\" is only relevant for unicode output");
+			}
+		else
+			{
+			Assert($$unicodebyteorder!=0, "the byte order must be specified if a byte-order mark is to be added to the unicode output");
+			my $firstUnicodeCharacter=($$unicodebyteorder==1)? "\xfe\xff": "\xff\xfe";
+			WriteToFile(\$firstUnicodeCharacter, $outputhandle);
+			}
+		}
+	}
+
+# Prints the contents of a buffer to a file handle.
+#
+# Parameters:
+#     $buffer       - reference to the scalar holding the bytes to write
+#     $outputhandle - the file handle to write to
+sub WriteToFile 
+	{
+	my ($buffer, $outputhandle) = @_;
+	print {$outputhandle} ${$buffer};
+	}
+
+# Reads the whole of the input, converts it from the input encoding to the
+# output encoding (going via little-endian unicode when the two differ) and
+# writes the result to the output.
+#
+# Parameters:
+#     $unicodebyteorder  - reference to the byte order of unicode text
+#     $inputencoding     - reference to the input encoding name
+#     $outputencoding    - reference to the output encoding name
+#     $inputhandle       - the file handle to read from
+#     $outputhandle      - the file handle to write to
+#     $foreignCharacters - reference to the hash passed on to the converters
+#     $unicodeCharacters - reference to the hash passed on to the converters
+sub DoConversion
+	{
+	my $unicodebyteorder = shift;
+	my $inputencoding = shift;
+	my $outputencoding = shift;
+	my $inputhandle = shift;
+	my $outputhandle = shift; 
+	my $foreignCharacters = shift;
+	my $unicodeCharacters = shift;
+	
+	# each conversion step reads from one buffer and writes to the next
+	my $currentBuffer = 0;
+	my @arrayOfBuffers = ('', '', '');
+	my $largeNumber=1000000;
+	# keep reading chunks until one comes back empty, so that input longer than
+	# $largeNumber bytes is converted in full rather than cut short
+	for (;;)
+		{
+		my $chunk = '';
+		ReadFromFile(\$chunk, $largeNumber, $inputhandle);
+		if (length($chunk) == 0)
+			{
+			last;
+			}
+		$arrayOfBuffers[$currentBuffer] .= $chunk;
+		}
+	ReverseByteOrderIfUnicodeAndBigEndian($unicodebyteorder, $inputencoding, \($arrayOfBuffers[$currentBuffer]));
+	if ($$inputencoding ne $$outputencoding) 
+		{
+		if ($$inputencoding !~ /^unicode$/i)
+			{
+			my $nextBuffer = $currentBuffer + 1;
+			OtherToUnicode ($inputencoding, \($arrayOfBuffers[$nextBuffer]), ($arrayOfBuffers[$currentBuffer]), $foreignCharacters, $unicodeCharacters, 'v');
+			$currentBuffer = $nextBuffer;
+			}
+		if ($$outputencoding !~ /^unicode$/i)
+			{
+			my $nextBuffer = $currentBuffer + 1;
+			UnicodeToOther($outputencoding, \($arrayOfBuffers[$nextBuffer]), ($arrayOfBuffers[$currentBuffer]), $foreignCharacters, $unicodeCharacters, 'v');
+			$currentBuffer = $nextBuffer;
+			}
+		}
+	ReverseByteOrderIfUnicodeAndBigEndian($unicodebyteorder, $outputencoding, \($arrayOfBuffers[$currentBuffer]));
+	WriteToFile(\($arrayOfBuffers[$currentBuffer]), $outputhandle);
+	}
+
+# Swaps the two bytes of every 16-bit unit in a buffer when the encoding is
+# "unicode" and the byte order is big-endian; does nothing for any other
+# encoding.
+#
+# Parameters:
+#     $unicodebyteorder - reference to the byte order; if it is 0
+#                         (unspecified) a warning is printed and it is set
+#                         to 2 (little-endian)
+#     $encoding         - reference to the encoding name of the buffer
+#     $buffer           - reference to the buffer, modified in place
+#
+# Dies if a unicode buffer holds an odd number of bytes.
+sub ReverseByteOrderIfUnicodeAndBigEndian
+	{
+	my ($unicodebyteorder, $encoding, $buffer) = @_;
+
+	return unless ($$encoding =~ /^unicode$/i);
+
+	Assert(length($$buffer)%2==0, "internal error (bad number of bytes in unicode buffer)");
+	if ($$unicodebyteorder==0)
+		{
+		PrintWarning("the byte order of unicode text is unspecified - defaulting to little-endian");
+		$$unicodebyteorder = 2;
+		}
+	# read the buffer as big-endian 16-bit values and write them back as little-endian ones
+	$$buffer=pack('v*', unpack('n*', $$buffer)) if ($$unicodebyteorder==1);
+	}
+
+# Loads the conversion data for an encoding from the file
+# "charconv\<encoding>.dat" beneath this script's directory.
+#
+# Layout of the data file, as read below:
+#     1 byte   endianness code
+#     1 byte   length of the replacement character, followed by that many bytes
+#     1 byte   number of ranges, followed by 3 bytes per range
+#              (lowest lead byte, highest lead byte, number of following bytes)
+#     the rest pairs of 16-bit little-endian values: foreign code, unicode code
+#
+# Parameters:
+#     $foreignCharacters    - reference to a hash; emptied, then filled with
+#                             unicode code => foreign code
+#     $unicodeCharacters    - reference to a hash; emptied, then filled with
+#                             foreign code => unicode code
+#     $encoding             - reference to the encoding name
+#     $replacementCharacter - reference; receives the replacement character bytes
+#     $ranges               - reference to an array; one [lower, upper, follow]
+#                             entry is appended per range
+#     $bigEndian            - reference; set to 0 or 1 from the endianness code
+#                             (left untouched if the code is not recognised)
+#
+# Dies if the data file does not exist, cannot be opened, or ends before the
+# header has been read in full.
+sub FillInHashes
+	{
+	my $foreignCharacters = shift;
+	my $unicodeCharacters = shift;
+	my $encoding = shift; 
+	my $replacementCharacter = shift;
+	my $ranges = shift;
+	my $bigEndian = shift;
+
+	my $dataFile=&PerlScriptPath()."charconv\\".$$encoding.'.dat';
+	if (!-e $dataFile)
+		{
+		die ("Encoding Format \"$$encoding\" not recognised");
+		}
+
+	# the same two hashes are handed in for the input encoding and then again for the
+	# output encoding, so drop anything left over from a previously loaded encoding
+	%$foreignCharacters = ();
+	%$unicodeCharacters = ();
+
+	open (my $dataHandle, '<', $dataFile) or die ("Could not open file for reading");
+	binmode $dataHandle;
+
+	# reads exactly the given number of bytes from the data file, or dies
+	my $readBytes = sub
+		{
+		my $numOfBytes = shift;
+		my $bytes = '';
+		my $numOfBytesRead = read $dataHandle, $bytes, $numOfBytes;
+		if (!defined($numOfBytesRead) || ($numOfBytesRead != $numOfBytes))
+			{
+			die ("Data file \"$dataFile\" is truncated or unreadable");
+			}
+		return $bytes;
+		};
+	# reads a single byte from the data file and returns it as a number
+	my $readByte = sub
+		{
+		return unpack("C", $readBytes->(1));
+		};
+
+	# reading the endianness
+	# NOTE(review): the byte-order numbers documented near the top of this file say
+	# 1 is big-endian and 2 is little-endian, whereas here code 2 selects big-endian -
+	# confirm against whatever writes the .dat files before changing this mapping
+	my $endianness = $readByte->();
+	if (($endianness == 0) || ($endianness == 1))
+		{
+		$$bigEndian = 0;
+		}
+	elsif ($endianness == 2)
+		{
+		$$bigEndian = 1;
+		}
+	else
+		{
+		# reported on STDERR, as STDOUT may be carrying the converted text
+		PrintWarning("Illegal Endianness specified in the control files");
+		}
+
+	# reading the replacement characters
+	my $replacenum = $readByte->();
+	$$replacementCharacter = $readBytes->($replacenum);
+
+	# reading the ranges
+	my $rangenum = $readByte->();
+	my $i; # loop variable 
+	for ($i=0; $i < $rangenum; ++$i)
+		{
+		my $lowerrange = $readByte->();
+		my $upperrange = $readByte->();
+		my $followchar = $readByte->();
+
+		push @$ranges,[$lowerrange,$upperrange,$followchar];
+		}
+
+	# reading the mappings: everything up to the end of the file
+	my $data = do { local $/; <$dataHandle> };
+	if (!defined $data)
+		{
+		$data = '';
+		}
+	close($dataHandle);
+
+	my @unpackeddata = unpack "v*",$data;
+	# step through the values two at a time; a value left over at the end without a partner is ignored
+	for($i = 0; $i+1 <= $#unpackeddata; $i= $i+2)
+		{
+		$unicodeCharacters->{$unpackeddata[$i]}=$unpackeddata[$i+1];
+		$foreignCharacters->{$unpackeddata[$i+1]}=$unpackeddata[$i];
+		}
+	}
+
+# Converts text in a foreign encoding to unicode.
+#
+# Parameters:
+#     $inputencoding     - reference to the name of the foreign encoding
+#     $unicode           - reference to the buffer the unicode text is appended to
+#     $other             - the foreign text (a string of bytes)
+#     $foreignCharacters - reference to the hash passed on to FillInHashes
+#     $unicodeCharacters - reference to the hash passed on to FillInHashes and
+#                          then used to look up foreign code => unicode code
+#     $unicodetemplate   - the pack template for one unicode character
+#
+# "utf8" is handed straight to Utf8ToUnicode, whose result is returned.
+# For every other encoding, each character's length is decided by finding its
+# first byte in the ranges loaded by FillInHashes; dies if that byte is in no
+# range. Characters without a mapping, and a character cut short by the end
+# of the text, are converted to U+FFFD.
+sub OtherToUnicode
+	{
+	my $inputencoding = shift;
+	my $unicode = shift;  
+	my $other = shift; 
+	my $foreignCharacters = shift;
+	my $unicodeCharacters = shift;
+	my $unicodetemplate = shift;
+	my $replacementCharacter = 0;
+	my $unicodeReplacementCharacter = pack($unicodetemplate, 0xfffd);
+	my @ranges=(); 
+ 
+	if ($$inputencoding=~/^utf8$/i)
+		{
+		return &Utf8ToUnicode($unicode, $other, $unicodetemplate);
+		}
+	my $bigEndian;
+	FillInHashes($foreignCharacters,$unicodeCharacters, $inputencoding, \$replacementCharacter,\@ranges,\$bigEndian);
+
+	my $otherIndex= 0;
+	my $numOfBytes = length($other);
+	while ($otherIndex < $numOfBytes)
+		{
+		my $frontByte = ord(substr($other, $otherIndex, 1));
+		# worked out afresh for every character, so that a first byte which is in
+		# no range is caught instead of inheriting the previous character's length
+		my $inRange = 0;
+		my $followByte = 0;
+		# @ranges is an array of references. Each reference is a reference to an array
+		# (if ranges overlap, the last one that matches decides)
+		foreach my $range (@ranges)
+			{
+			if (($frontByte >= $range->[0]) && ($frontByte <= $range->[1]))
+				{
+				$followByte = $range->[2];
+				$inRange = 1;
+				}
+			}
+		Assert ($inRange != 0, "cannot figure out the Byte size of the character");
+		if ($otherIndex + $followByte >= $numOfBytes)
+			{
+			# the text ends part-way through this character
+			$$unicode .= $unicodeReplacementCharacter;
+			last;
+			}
+		# assemble the character's code from its first byte and the bytes that follow it
+		my $tempByte = 0;
+		my $key;
+		for ($key = 0; $key<= $followByte; ++$key)
+			{
+			my $byte = ord(substr($other, $otherIndex, 1));
+			if ($bigEndian)
+				{
+				$tempByte = ($tempByte << 8) | $byte;
+				}
+			else
+				{
+				$tempByte = $tempByte | ($byte << (8*$key));
+				}
+			$otherIndex++;	
+			}
+		if (exists $unicodeCharacters->{$tempByte})
+			{
+			$$unicode .= pack $unicodetemplate , $unicodeCharacters->{$tempByte};
+			}
+		else
+			{
+			$$unicode .= $unicodeReplacementCharacter;
+			}
+		}
+	}
+
+# Converts unicode text to a foreign encoding.
+#
+# Parameters:
+#     $outputencoding    - reference to the name of the foreign encoding
+#     $other             - reference to the buffer the foreign text is appended to
+#     $unicode           - the unicode text (a string of bytes)
+#     $foreignCharacters - reference to the hash passed on to FillInHashes and
+#                          then used to look up unicode code => foreign code
+#     $unicodeCharacters - reference to the hash passed on to FillInHashes
+#     $unicodetemplate   - the pack template for one unicode character
+#
+# "utf8" is handed straight to UnicodeToUtf8, whose result is returned.
+# For every other encoding, characters without a mapping are replaced by the
+# replacement character loaded by FillInHashes.
+sub UnicodeToOther
+	{
+	my ($outputencoding, $other, $unicode, $foreignCharacters, $unicodeCharacters, $unicodetemplate) = @_;
+
+	if ($$outputencoding=~/^utf8$/i)
+		{
+		return &UnicodeToUtf8($other, $unicode, $unicodetemplate);
+		}
+
+	my $replacementCharacter = 0;
+	my @ranges=(); 
+	my $bigEndian;
+	FillInHashes($foreignCharacters,$unicodeCharacters, $outputencoding, \$replacementCharacter,\@ranges,\$bigEndian);
+
+	my $foreignTemplate='v';
+	if ($bigEndian)
+		{
+		$foreignTemplate='n';
+		}
+	foreach my $unicodeValue (unpack("$unicodetemplate*", $unicode))
+		{
+		unless (exists($foreignCharacters->{$unicodeValue}))
+			{
+			$$other .= $replacementCharacter;
+			next;
+			}
+		# Known limitation: a foreign code is written as one byte if it is no
+		# greater than 255 and as two bytes otherwise, so this only works for
+		# encodings whose characters are one or two bytes long
+		my $foreignValue = $foreignCharacters->{$unicodeValue};
+		$$other .= pack((($foreignValue <= 255)? "C": $foreignTemplate), $foreignValue);
+		}
+	}
+