codificação de textos

Olá, estou criando um pequeno software para aparelhos CLDC1.0 e MIDP1.0. Como há uma quantidade considerável de textos nesse software, criei arquivos txt que são lidos durante a execução. E aí surge um problema.

Se os arquivos txt são gravados em ANSI, aparelhos Nokia os leem normalmente, mas os Motorola não, apresentando caracteres acentuados incorretamente. Já em UTF-8 acontece o contrário…

Imagino que a resposta seja “não”, mas há alguma forma “universal” para gravar/ler esses textos? Caso não, há alguma função que eu possa usar para converter ANSI para UTF-8 ou vice-versa, como o utf8_encode do php?

Mais uma questão relacionada: uso esses arquivos externos assumindo que isso irá reduzir o consumo de memória de execução no aparelho, carregando os textos apenas no momento em que vou usá-los. Estou certo? Ou incorporar esses textos no código geraria o mesmo resultado?

Encontrei a saída em um site, usando funções de conversão da codificação dos caracteres. Daí foi só fazer um teste simples com um arquivo padronizado e, ao que parece, está tudo ok. O código é o seguinte:

/*

Author : Shivakumar
Mail   : shiva (at) blisspark.com
Disclaimer : This code is provided without any implied or expressed warranty and may not work as
expected. If you find any bugs, inform me or post the fix here.

*/

public static String UTF8Decode&#40;byte in&#91;&#93;, int offset, int length&#41;
&#123;
	StringBuffer buff = new StringBuffer&#40;&#41;;
	int max = offset + length;
	for&#40; int i = offset ; i &lt; max ; i++&#41;
	&#123;
		char c = 0;
		if&#40;&#40;in&#91;i&#93; &amp; 0x80&#41; == 0&#41;
		&#123;
			c = &#40;char&#41; in&#91;i&#93;;
		&#125;
		else if&#40;&#40; in&#91;i&#93; &amp; 0xe0 &#41; == 0xc0&#41;	// 11100000
		&#123;
			c |= &#40;&#40;in&#91;i&#93; &amp; 0x1f&#41; &lt;&lt; 6&#41;;		// 00011111
			i++;
			c |= &#40;&#40;in&#91;i&#93; &amp; 0x3f&#41; &lt;&lt; 0&#41;;		// 00111111
		&#125;
		else if&#40;&#40; in&#91;i&#93; &amp; 0xf0&#41; == 0xe0&#41;	// 11110000
		&#123;
			c |= &#40;&#40;in&#91;i&#93; &amp; 0x0f&#41; &lt;&lt; 12&#41;;	// 00001111
			i++;
			c |= &#40;&#40;in&#91;i&#93; &amp; 0x3f&#41; &lt;&lt; 6&#41;;		// 00111111
			i++;
			c |= &#40;&#40;in&#91;i&#93; &amp; 0x3f&#41; &lt;&lt; 0&#41;;		// 00111111
		&#125;
		else if&#40;&#40;in&#91;i&#93; &amp; 0xf8&#41; == 0xf0&#41;		// 11111000
		&#123;
			c |= &#40;&#40;in&#91;i&#93; &amp; 0x07&#41; &lt;&lt; 18&#41;;	// 00000111 &#40;move 18, not 16?&#41;
			i++;
			c |= &#40;&#40;in&#91;i&#93; &amp; 0x3f&#41; &lt;&lt; 12&#41;;	// 00111111
			i++;
			c |= &#40;&#40;in&#91;i&#93; &amp; 0x3f&#41; &lt;&lt; 6&#41;;		// 00111111
			i++;
			c |= &#40;&#40;in&#91;i&#93; &amp; 0x3f&#41; &lt;&lt; 0&#41;;		// 00111111
		&#125;
		else
		&#123;
			c = '?';
		&#125;
		buff.append&#40;c&#41;;
	&#125;
	return buff.toString&#40;&#41;;
&#125;

public static byte&#91;&#93; UTF8Encode&#40;String str&#41;
&#123;
	ByteArrayOutputStream bos = new ByteArrayOutputStream&#40;&#41;;
	try
	&#123;
		int strlen = str.length&#40;&#41;;

		for&#40; int i = 0 ; i &lt; strlen ; i++ &#41;
		&#123;
			char t = str.charAt&#40;i&#41;;
			int c = 0;
			c |= &#40; t &amp; 0xffff &#41;;

			if&#40;c &gt;= 0 &amp;&amp; c &lt; 0x80&#41;
			&#123;
				bos.write&#40;&#40;byte&#41;&#40; c &amp; 0xff &#41;&#41;;
			&#125;
			else if&#40;c &gt; 0x7f &amp;&amp; c &lt; 0x800&#41;
			&#123;
				bos.write&#40; &#40;byte&#41; &#40;&#40;&#40; c &gt;&gt;&gt; 6 &#41; &amp; 0x1f &#41; | 0xc0 &#41;&#41;;
				bos.write&#40; &#40;byte&#41; &#40;&#40;&#40; c &gt;&gt;&gt; 0 &#41; &amp; 0x3f &#41; | 0x80 &#41;&#41;;
			&#125;
			else if&#40;c &gt; 0x7ff &amp;&amp; c &lt; 0x10000&#41;
			&#123;
				bos.write&#40; &#40;byte&#41; &#40;&#40;&#40; c &gt;&gt;&gt; 12 &#41; &amp; 0x0f &#41; | 0xe0 &#41;&#41;; // &lt;-- correction &#40;mb&#41;
				bos.write&#40; &#40;byte&#41; &#40;&#40;&#40; c &gt;&gt;&gt; 6 &#41; &amp; 0x3f &#41; | 0x80 &#41;&#41;;
				bos.write&#40; &#40;byte&#41; &#40;&#40;&#40; c &gt;&gt;&gt; 0 &#41; &amp; 0x3f &#41; | 0x80 &#41;&#41;;
			&#125;
			else if&#40;c &gt; 0x00ffff &amp;&amp; c &lt; 0xfffff&#41;
			&#123;
				bos.write&#40; &#40;byte&#41; &#40;&#40;&#40; c &gt;&gt;&gt; 18 &#41; &amp; 0x07 &#41; | 0xf0 &#41;&#41;;
				bos.write&#40; &#40;byte&#41; &#40;&#40;&#40; c &gt;&gt;&gt; 12 &#41; &amp; 0x3f &#41; | 0x80 &#41;&#41;;
				bos.write&#40; &#40;byte&#41; &#40;&#40;&#40; c &gt;&gt;&gt; 6 &#41; &amp; 0x3f &#41; | 0x80 &#41;&#41;;
				bos.write&#40; &#40;byte&#41; &#40;&#40;&#40; c &gt;&gt;&gt; 0 &#41; &amp; 0x3f &#41; | 0x80 &#41;&#41;;
			&#125;
		&#125;
		bos.flush&#40;&#41;;
	&#125;
	catch&#40;Exception e&#41;
	&#123;
	&#125;
	return bos.toByteArray&#40;&#41;;
&#125;

A função de teste simples que criei foi:

private void checkUTF8&#40;&#41; &#123;
        // loading the check file that must have only an ã char and saved as UTF-8

        // var
        StringBuffer buffer = null;
        InputStream is = null;
        InputStreamReader isr = null;

        // loading the file
        try &#123;
            Class c = this.getClass&#40;&#41;;
            is = c.getResourceAsStream&#40;&quot;/data/check.txt&quot;&#41;;
            if &#40;is == null&#41; &#123;
                throw new Exception&#40;&quot;The file check.txt does not exist!&quot;&#41;;
            &#125;
            isr = new InputStreamReader&#40;is&#41;;
            buffer = new StringBuffer&#40;&#41;;
            int ch;
            while &#40;&#40;ch = isr.read&#40;&#41;&#41; &gt; -1&#41; &#123;
                buffer.append&#40;&#40;char&#41; ch&#41;;
            &#125;
            if &#40;isr != null&#41; &#123;
                isr.close&#40;&#41;;
            &#125;
        &#125; catch &#40;Exception ex&#41; &#123;
            System.out.println&#40;ex.toString&#40;&#41;&#41;;
        &#125;

        // now that the text is loaded, comparing the char read
        if &#40;buffer.toString&#40;&#41; == &quot;ã&quot;&#41; &#123;
            // this system reads texts as UTF-8
            this.useUTF8 = true;
        &#125; else &#123;
            // this system does not read as UTF-8
            this.useUTF8 = false;
        &#125;
    &#125;

O código foi retirado de [url]http://www.j2meforums.com/wiki/index.php/UTF-8_Encoder/Decoder[/url]

Mas ainda fica a questão sobre o uso desses arquivos externos. Ele é mesmo justificável pelos motivos que coloquei aí em cima?

1 Resposta

Topicos relacionados