Click to See Complete Forum and Search --> : Chinese characters conversion on UNIX AIX


efimka
December 27th, 2006, 04:48 AM
Hi
I have a specific problem with conversion of chinese characters on UNIX platform.

I need to convert a string containing a mix of Simplified Chinese characters and numbers into a Simplified Chinese buffer compatible with the UTF-8 format.
I have tried to use the following function (see below):

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <malloc.h>
#include <limits.h>

wchar_t *pCommArea;

char ZeroChar = 0;
char ImexTQ[4];
char text[1000];

/*
 * CICS program entry point (C with embedded EXEC CICS commands, so it must go
 * through the CICS translator before the C compiler).
 *
 * Reads the wide-character string addressed by pCommArea, builds in sFinal a
 * copy in which every character below 128 is kept as-is and every other
 * character (and '=') is replaced by "=XX" hex escapes of its UTF-8 bytes,
 * then copies sFinal back over the comm area and returns to CICS.
 *
 * NOTE(review): dfheiptr and pCwa are not declared anywhere in the visible
 * source, and <wchar.h> (wcslen/wcscat/wcscpy/swprintf) is not included.
 */
void main(void) {

long RespCode1Link, RespCode2Link;
unsigned int ch;
unsigned char bt1, bt2, bt3, bt4, bt5, bt6;
int n;
wchar_t sFinal[1000];
wchar_t sTemp[1000];

/* Queue name is exactly 4 bytes; ImexTQ is char[4], so it is NOT NUL-terminated. */
memcpy(ImexTQ, "IXT0", 4);
/* NOTE(review): memset counts bytes, so this clears only the first 1000 bytes
   of sFinal, not all 1000 wchar_t elements, whenever sizeof(wchar_t) > 1. */
memset( sFinal, 0, 1000 );

/* Get addressability to the EIB; the response codes land in RespCode1Link/RespCode2Link. */
EXEC CICS ADDRESS
EIB ( dfheiptr )
RESP ( RespCode1Link )
RESP2( RespCode2Link );

/* On a non-zero response: log a message to the ImexTQ queue, abend with code
   "NRSP", then return. The same pattern is repeated after each ADDRESS below. */
if( RespCode1Link != 0 ) {
strcpy(text, "Abend on addressing to EIB.");
EXEC CICS WRITEQ TD
QUEUE(ImexTQ)
FROM(text)
LENGTH(strlen(text));
EXEC CICS ABEND
ABCODE( "NRSP" )
RESP ( RespCode1Link )
RESP2 ( RespCode2Link )
CANCEL;
EXEC CICS RETURN;
} /* endif */

/* Get addressability to the CWA */
EXEC CICS ADDRESS
CWA ( pCwa )
RESP ( RespCode1Link )
RESP2( RespCode2Link );

if( RespCode1Link != 0 ) {
strcpy(text, "Abend on addressing to CWA.");
EXEC CICS WRITEQ TD
QUEUE(ImexTQ)
FROM(text)
LENGTH(strlen(text));
EXEC CICS ABEND
ABCODE( "NRSP" )
RESP ( RespCode1Link )
RESP2 ( RespCode2Link )
CANCEL;
EXEC CICS RETURN;
} /* endif */

/* Get addressability to the comm area. */
EXEC CICS ADDRESS
COMMAREA( pCommArea )
RESP ( RespCode1Link )
RESP2 ( RespCode2Link );


if( RespCode1Link != 0 ) {
strcpy(text, "Abend on addressing to CommArea.");
EXEC CICS WRITEQ TD
QUEUE(ImexTQ)
FROM(text)
LENGTH(strlen(text));
EXEC CICS ABEND
ABCODE( "NRSP" )
RESP ( RespCode1Link )
RESP2 ( RespCode2Link )
CANCEL;
EXEC CICS RETURN;
}

/* Walk the comm area one wide character at a time, treating it as a
   NUL-terminated wchar_t string. wcslen() is re-evaluated on every iteration
   and its size_t result is compared against the signed int n.
   NOTE(review): nothing bounds the total appended to sFinal (1000 elements). */
for (n = 0; n < wcslen(pCommArea); ++n) {
/* Same byte-count caveat as above: clears 1000 bytes, not 1000 wchar_t. */
memset( sTemp, 0, 1000 );
ch = (unsigned int)pCommArea[n];
if (ch == L'=' ) {

/* '=' is the escape introducer, so it is itself written as "=3D".
   NOTE(review): swprintf is called without a buffer-size argument here and
   below; ISO C swprintf takes (wchar_t *, size_t, const wchar_t *, ...) -
   confirm which signature this platform provides. */
swprintf( sTemp, L"=%02X", ch );
wcscat( sFinal, sTemp );

/* NOTE(review): this repeats the two statements above, so a single '=' is
   appended twice ("=3D=3D") - looks unintended; confirm. */
swprintf( sTemp, L"=%02X", ch );
wcscat( sFinal, sTemp );

/* Trace output. NOTE(review): "%s" is given a wchar_t * (sFinal), which does
   not match the conversion; "%ls" is the wide-string form. */
sprintf(text, "2. sFinal = %s", sFinal);
EXEC CICS WRITEQ TD
QUEUE(ImexTQ)
FROM(text)
LENGTH(strlen(text));
}
else if (ch < 128) {

/* Values below 128 are copied through unchanged (sTemp was zeroed above, so
   sTemp[1] terminates it - subject to the memset caveat). */
sTemp[0] = pCommArea[n];
wcscat( sFinal, sTemp );
}
else if (ch <= 2047) {

/* 2-byte UTF-8 form: lead byte 0xC0 + high bits, continuation 0x80 + low 6 bits. */
bt1 = (unsigned char)(192 + (ch / 64));
bt2 = (unsigned char)(128 + (ch % 64));
swprintf( sTemp, L"=%02X=%02X", bt1, bt2 );
wcscat( sFinal, sTemp );
}
else if (ch <= 65535) {

/* 3-byte UTF-8 form: lead byte 0xE0 + top 4 bits, then two 6-bit continuations. */
bt1 = (unsigned char)(224 + (ch / 4096));
bt2 = (unsigned char)(128 + ((ch / 64) % 64));
bt3 = (unsigned char)(128 + (ch % 64));

/* Trace the three computed bytes (decimal) to the transient data queue. */
sprintf(text, "Byte1 = %d, Byte2 = %d, Byte3 = %d", bt1, bt2, bt3);
EXEC CICS WRITEQ TD
QUEUE(ImexTQ)
FROM(text)
LENGTH(strlen(text));

/* Note: this branch emits lowercase hex (%02x); every other branch uses %02X. */
swprintf( sTemp, L"=%02x=%02x=%02x", bt1, bt2, bt3 );
wcscat( sFinal, sTemp );
}
else if (ch <= 2097151) {

/* 4-byte form: lead byte 0xF0 + top 3 bits, then three 6-bit continuations. */
bt1 = (unsigned char)(240 + (ch / 262144));
bt2 = (unsigned char)(128 + ((ch / 4096) % 64));
bt3 = (unsigned char)(128 + ((ch / 64) % 64));
bt4 = (unsigned char)(128 + (ch % 64));
swprintf( sTemp, L"=%02X=%02X=%02X=%02X", bt1, bt2, bt3, bt4 );
wcscat( sFinal, sTemp );
}
else if (ch <= 67108863) {

/* 5-byte form (lead byte 0xF8 + top 2 bits); beyond the 4-byte limit of current UTF-8. */
bt1 = (unsigned char)(248 + (ch / 16777216));
bt2 = (unsigned char)(128 + ((ch / 262144) % 64));
bt3 = (unsigned char)(128 + ((ch / 4096) % 64));
bt4 = (unsigned char)(128 + ((ch / 64) % 64));
bt5 = (unsigned char)(128 + (ch % 64));
swprintf( sTemp, L"=%02X=%02X=%02X=%02X=%02X", bt1, bt2, bt3, bt4, bt5 );
wcscat( sFinal, sTemp );
}
else if (ch <= 2147483647) {

/* 6-byte form (lead byte 0xFC + top bit). Values above 2147483647 match no
   branch and are silently dropped from the output. */
bt1 = (unsigned char)(252 + (ch / 1073741824));
bt2 = (unsigned char)(128 + ((ch / 16777216) % 64));
bt3 = (unsigned char)(128 + ((ch / 262144) % 64));
bt4 = (unsigned char)(128 + ((ch / 4096) % 64));
bt5 = (unsigned char)(128 + ((ch / 64) % 64));
bt6 = (unsigned char)(128 + (ch % 64));
swprintf( sTemp, L"=%02X=%02X=%02X=%02X=%02X=%02X", bt1, bt2, bt3, bt4, bt5, bt6);
wcscat( sFinal, sTemp );
}
}

/* NOTE(review): pCommArea is a wchar_t *, so sizeof( pCommArea ) is the size
   of the pointer, not of the comm area; only that many bytes are cleared. */
memset( pCommArea, 0, sizeof( pCommArea ) );
/* Hand the escaped text back through the comm area.
   NOTE(review): the output can be much longer than the input (one character
   may become up to 18); the comm area's capacity is not visible here - confirm
   it is large enough. */
wcscpy( pCommArea, sFinal );

EXEC CICS RETURN;
}

Function does not work.

Please help me solve the problem, because I don't have enough experience in this area.

Thank you.

SuperKoko
December 27th, 2006, 12:40 PM
need to convert string mixed with simplified chinese characters and numbers to simplified chinese buffer compatible with format UTF8

What is the source format? Is the character set GB18030, GBK or GB2312 ? What's the encoding?

Is this code entirely yours?
I can see several possibilities:

You got this whole code from somewhere (e.g. on the Internet)
You got most of this code on the Internet but added a few things to make it a C program, such as "main()", including headers, etc.
You wrote this whole code on your own.


In the first case, I would suggest dropping this code and writing a new program from scratch (using setlocale(), wcstombs and mbstowcs), or using the iconv program if it's available on your platform.

libiconv can be used as well:
http://www.gnu.org/software/libiconv/


Function does not work.

At which stage is there a problem?
Compilation? Execution?
If you've not written this code, the problem will most likely be at compile time, because this code simply is not valid C code... It's full of weird things that look like another language... It probably requires using a special compiler/pre-processor.

Moreover, this code seems full of bugs:

memcpy(ImexTQ, "IXT0", 4); /* this string is not zero-terminated... Is it intended? */
memset( sFinal, 0, 1000 ); /* If sizeof(wchar_t)>sizeof(char), it doesn't fill the entire sFinal buffer with zeroes */


http://catb.org/~esr/faqs/smart-questions.html

efimka
December 28th, 2006, 04:48 AM
Thank you for quick response.
1.
What is the source format? - Simplified chinese.
Is the character set GB18030, GBK or GB2312 ?
GB2312

What's the encoding?
Destination is: UTF-8 format. Result should be saved in Oracle database
with NLS_CHARACTERSET=UTF8.

2. Is this code entirely yours?
I can see several possibilities:
You got this whole code from somewhere (e.g. on the Internet)
You got most of this code on the Internet but added a few things to make it a C program, such as "main()", including headers, etc.
You wrote this whole code on your own.

Answer:
Original Code was sent by our chinese customer and was written using C++.
http://www.codeproject.com/string/UTF8.asp

We're programming on UNIX AIX under IBM TXSeries and don't have any C++ compilers available.

3. iconv is not available for use.

How to use setlocale command for "chinese-simplified" ?
I have tried several options without success, except for one:

setlocale(LC_ALL, "C") or setlocale(LC_CTYPE, "C")

4. memcpy(ImexTQ, "IXT0", 4); works OK, but could be used as null-terminated.

memset( sFinal, 0, 1000 ); could be removed from the code altogether if another solution is found.

Thank you for explanations.

One more question.

SuperKoko
December 30th, 2006, 07:20 AM
Sorry, but this link from which you've extracted your conversion function:
http://www.codeproject.com/string/UTF8.asp

Shows a function that applies two encodings successively:
The source encoding is: UCS-2 (http://en.wikipedia.org/wiki/UCS-2): unicode encoding with 2 bytes for each character and only able to encode BMP characters (i.e. characters < 65536).
Then, it converts to UTF-8 (http://en.wikipedia.org/wiki/UTF-8) (a multibyte encoding using 8 bits bytes).
Then, it converts from UTF-8 to the quoted-printable (http://en.wikipedia.org/wiki/Quoted-printable) content-transfer-encoding... An encoding that represents all the 256 byte values of 8 bits bytes in a 7 bits text stream...
In the original code, each of these 7-bit characters was stored in a 2-byte WCHAR.

I'm not sure it's what you want.

Here is a way to convert from GB2312 to UTF-8:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include <locale.h>
#include <limits.h>

/* Status codes returned by the conversion helpers in this file.
   The order must stay in sync with the message table in
   TranslateEncodingConversionError(). */
enum EncodingConversionError {
    ECE_OK,
    ECE_UNEXPECTED,
    ECE_INVALID_MULTIBYTE,
    ECE_OUT_OF_MEMORY,
    ECE_CHARACTER_UNSUPPORTED,
    ECE_ENCODING_UNSUPPORTED
};

/*
 * Map a conversion status code to a human-readable English message.
 *
 * ece: one of the EncodingConversionError enumerators.
 * Returns a pointer to a static string (never NULL; the caller must not free
 * or modify it). Any value outside the enumerator range yields
 * "Unknown error" instead of indexing past the end of the table.
 */
const char* TranslateEncodingConversionError(enum EncodingConversionError ece) {
    static const char* const messages[]={
        "No error"
        ,"Unexpected error"
        ,"Invalid multibyte character sequence"
        ,"Out of memory"
        ,"Multibyte character set doesn't contain this character"
        ,"Character encoding (LC_CTYPE) unknown or unsupported"
    };
    const size_t message_count = sizeof messages / sizeof messages[0];

    /* The enum's underlying type may be signed or unsigned, so guard both ends. */
    if ((int)ece < 0 || (size_t)ece >= message_count) {
        return "Unknown error";
    }
    return messages[ece];
}

/*
 * Count how many characters the multibyte sequence `s` holds under the
 * current LC_CTYPE locale.
 *
 * s:        start of the multibyte data.
 * src_size: maximum number of bytes of `s` to examine.
 * length:   receives the character count; set to 0 on failure.
 *
 * Counting stops at the end of the `src_size` bytes or at the first NUL
 * character, whichever comes first.
 * Returns ECE_OK, or ECE_INVALID_MULTIBYTE if mblen() rejects a sequence.
 */
enum EncodingConversionError wcslen_of_mbs(const char* s, size_t src_size, size_t* length) {
    size_t remaining = src_size;
    size_t count = 0;

    *length = 0;
    /* A NULL string pointer resets mblen()'s internal shift state. */
    mblen(NULL, remaining);

    for (;;) {
        int step;

        if (remaining == 0) {
            break;
        }
        step = mblen(s, remaining);
        if (step < 0) {
            return ECE_INVALID_MULTIBYTE;
        }
        if (step == 0) {
            /* NUL character: end of the string before the byte limit. */
            break;
        }
        ++count;
        s += step;
        remaining -= step;
    }

    *length = count;
    return ECE_OK;
}
/*
 * Compute how many bytes the first `src_size` wide characters of `s` occupy
 * once encoded as multibyte characters in the current LC_CTYPE locale.
 *
 * s:        wide characters to measure.
 * src_size: number of wide characters to measure.
 * length:   receives the byte count; set to 0 on failure (matching
 *           wcslen_of_mbs, so callers never see an indeterminate value).
 *
 * Returns ECE_OK, or ECE_CHARACTER_UNSUPPORTED if wctomb() cannot represent
 * a character in the current locale.
 */
enum EncodingConversionError mbslen_of_wcs(const wchar_t* s, size_t src_size, size_t* length) {
    size_t total = 0;
    char scratch[MB_LEN_MAX];

    *length = 0;
    /* A NULL buffer resets wctomb()'s shift state; the character argument is
       ignored in that case, so pass a plain null wide character. */
    (void)wctomb(NULL, L'\0');

    for (; src_size > 0; --src_size, ++s) {
        int encoded = wctomb(scratch, *s);

        if (encoded < 0) {
            return ECE_CHARACTER_UNSUPPORTED;
        }
        if (encoded == 0) {
            /* Not expected from wctomb() with a non-NULL buffer; stop and
               report what was measured so far, as the original did. */
            break;
        }
        total += (size_t)encoded;
    }

    *length = total;
    return ECE_OK;
}

enum EncodingConversionError alloc_mbstowcs(
const char* const source
,wchar_t** const destination
) {
enum EncodingConversionError err;
size_t src_size,dest_size;
wchar_t* dest;

*destination=NULL;
if (!source) return ECE_OK;

src_size = strlen(source);
if ((err=wcslen_of_mbs(source, src_size, &dest_size)))
return err;

dest = malloc(sizeof(wchar_t)*(dest_size+1));
if (!dest) return ECE_OUT_OF_MEMORY;
{
size_t dest_final_size;
dest_final_size=mbstowcs(dest, source, dest_size);

if (dest_final_size != dest_size) {
free(dest);
if (dest_final_size == (size_t)-1) return ECE_INVALID_MULTIBYTE;
else return ECE_UNEXPECTED;
}
}
dest[dest_size]=0;
*destination=dest;
return ECE_OK;
}
enum EncodingConversionError lalloc_mbstowcs(
const char* const locale
,const char* const source
,wchar_t** const destination
) {
const char* old_locale;
enum EncodingConversionError err;

old_locale=setlocale(LC_CTYPE, locale);
if (!old_locale) return ECE_ENCODING_UNSUPPORTED;

err=alloc_mbstowcs(source, destination);

setlocale(LC_CTYPE,old_locale);
return err;
}
enum EncodingConversionError alloc_wcstombs(
const wchar_t* const source
,char** const destination
) {
enum EncodingConversionError err;
size_t src_size, dest_size;
char* dest;

*destination=NULL;
if (!source) return ECE_OK;

src_size = wcslen(source);
if ((err=mbslen_of_wcs(source, src_size, &dest_size)))
return err;

dest = malloc(sizeof(char)*(dest_size+1));
if (!dest) return ECE_OUT_OF_MEMORY;
{
size_t dest_final_size;
dest_final_size=wcstombs(dest, source, dest_size);

if (dest_final_size != dest_size) {
free(dest);
if (dest_final_size == (size_t)-1) return ECE_CHARACTER_UNSUPPORTED;
else return ECE_UNEXPECTED;
}
}
dest[dest_size]=0;
*destination=dest;
return ECE_OK;
}

enum EncodingConversionError lalloc_wcstombs(
const char* const locale
,const wchar_t* const source
,char** const destination
) {
const char* old_locale;
enum EncodingConversionError err;

old_locale=setlocale(LC_CTYPE, locale);
if (!old_locale) return ECE_ENCODING_UNSUPPORTED;

err=alloc_wcstombs(source, destination);

setlocale(LC_CTYPE,old_locale);
return err;
}
enum EncodingConversionError lalloc_mbstombs(
const char* const source_encoding
,const char* const destination_encoding
,const char* const source
,char** const destination)
{
enum EncodingConversionError err;
wchar_t* wcs;
if ((err=lalloc_mbstowcs(source_encoding, source, &wcs)))
return err;
err=lalloc_wcstombs(destination_encoding, wcs, destination);
free(wcs);
return err;
}

/*
 * Filter: read GB2312-encoded text from stdin in chunks, convert each chunk
 * to UTF-8 and write it to stdout.
 *
 * Returns 0 on success, or 1 after printing a diagnostic to stderr when a
 * conversion fails.
 *
 * NOTE(review): the locale names are platform-specific; verify which names
 * the target system provides. A line longer than the buffer is read in
 * several chunks, and a multibyte character split across two chunks would be
 * reported as an invalid sequence.
 */
int main(void) {
    char* dest;
    enum EncodingConversionError err=ECE_OK;
    char buffer[256];

    /* fgets() always NUL-terminates within the size it is given, so no manual
       terminator is needed (writing buffer[256] would be out of bounds). */
    while (fgets(buffer, sizeof buffer, stdin)
        && ECE_OK==(err=lalloc_mbstombs("zh_CN.GB2312", "zh_CN.UTF-8", buffer, &dest))
    ) {
        printf("%s", dest);
        free(dest);
    }
    if (err!=ECE_OK) {
        fprintf(stderr, "%s", TranslateEncodingConversionError(err));
        return 1;
    }
    return 0;
}



Writing a function that converts from/to quoted-printable encoding should be easy, if you need it.

efimka
December 31st, 2006, 08:24 AM
Hi SuperKoko !

I really appreciate your help.
I believe that it will help me to solve the problem.
I will try to implement your code into program running under transaction server CICS of IBM that we are using and let you know how it works.

Thank you.