やんちのプログラミングメモ　EUC-JP to UTF-16 example

EUC-JP to UTF-16 example
EUC-JP から UTF-16 への文字コード変換のサンプル実装です。
MinGW + gcc 3.4.2 環境において動作確認してあります。

以下は、EucJpToUtf16() の実装例です。 [euc_jp_to_utf16_c.zip]
Example Code
/*
 * file:euc_jp_to_utf16.c
 */
#define STRICT
#include <limits.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include "euc_jp_to_utf16_table.h"
/*
 * Substitute code unit stored in the output when the conversion table has no
 * mapping for an input sequence: U+FF1F (FULLWIDTH QUESTION MARK).
 */
#define DUMMY_CODE (0xff1f)
/*
 * Smaller of two values. The #ifndef guard leaves any definition that is
 * already in effect (e.g. from a platform header) untouched.
 * Note: each argument is evaluated more than once.
 */
#ifndef min
#define min(a,b) ((a)<(b)?(a):(b))
#endif
/*
 * Capacity of the fixed buffers in main(): maximum number of input bytes read
 * and maximum number of wchar_t elements produced.
 */
#define DEFAULT_BUFF_LEN 2048

/*
 * Converts src_size bytes of EUC-JP text in src into dest (capacity dest_size
 * wchar_t elements). Returns the number of characters converted, or -1 on
 * failure. Defined later in this file.
 */
int EucJpToUtf16(wchar_t *dest, size_t dest_size, char *src, size_t src_size);

/**
 * Sample driver for EucJpToUtf16().
 *
 * Reads at most DEFAULT_BUFF_LEN bytes of EUC-JP text from "data.txt",
 * converts them, and writes a byte-order mark followed by the converted
 * characters to "out.txt". Input beyond DEFAULT_BUFF_LEN bytes is ignored.
 *
 * The output is written as raw wchar_t values in host byte order.
 * NOTE(review): this is UTF-16 only where wchar_t is 16 bits wide, which is
 * presumably the case on the MinGW target this sample was written for.
 *
 * @return 0 on success, 1 on any I/O or conversion failure.
 */
int main(int argc, char *argv[])
{
    char        inpStr[DEFAULT_BUFF_LEN + 1];
    wchar_t     outStr[DEFAULT_BUFF_LEN + 1];
    FILE        *pInpFile;
    FILE        *pOutFile;
    long        sizeInpFile;
    size_t      sizeToRead;
    size_t      sizeRead;
    int         iResult = 0;
    int         writeFailed = 0;
    wchar_t     bom = 0xfeff;

    (void)argc;
    (void)argv;

    _wsetlocale(LC_ALL, L"japanese");

    /*
     * Read the input text from the file.
     */
    memset(inpStr, 0, sizeof(inpStr));
    pInpFile = fopen("data.txt", "rb");
    if (pInpFile == NULL)
    {
        fwprintf(stderr, L"ファイルが開けません: data.txt");
        return 1;
    }

    /*
     * Determine the file size. ftell() reports failure as -1; that value must
     * never reach fread() as a length, so every step is checked.
     */
    sizeInpFile = -1L;
    if (fseek(pInpFile, 0, SEEK_END) == 0)
    {
        sizeInpFile = ftell(pInpFile);
    }
    if (sizeInpFile < 0 || fseek(pInpFile, 0, SEEK_SET) != 0)
    {
        fclose(pInpFile);
        fwprintf(stderr, L"ファイル読み込み失敗: data.txt");
        return 1;
    }

    /* Clamp the request to the buffer capacity. */
    sizeToRead = (sizeInpFile < DEFAULT_BUFF_LEN)
                    ? (size_t)sizeInpFile
                    : (size_t)DEFAULT_BUFF_LEN;
    sizeRead = fread(inpStr, 1, sizeToRead, pInpFile);
    fclose(pInpFile);

    /* An empty file or a short read is treated as a read failure. */
    if (sizeToRead == 0 || sizeRead != sizeToRead)
    {
        fwprintf(stderr, L"ファイル読み込み失敗: data.txt");
        return 1;
    }

    /*
     * Convert euc_jp:inpStr -> utf16:outStr.
     */
    memset(outStr, 0, sizeof(outStr));
    iResult = EucJpToUtf16(outStr, DEFAULT_BUFF_LEN, inpStr, sizeRead);
    if (iResult < 0)
    {
        fprintf(stderr, "EucJpToUtf16() Failed.\n");
        return 1;
    }
    wprintf(L"%d 文字変換しました。\n", iResult);

    /*
     * Write the BOM and the converted characters to the output file.
     */
    pOutFile = fopen("out.txt", "wb");
    if (pOutFile == NULL)
    {
        fwprintf(stderr, L"ファイルが開けません: out.txt");
        return 1;
    }
    if (fwrite(&bom, sizeof(bom), 1, pOutFile) != 1)
    {
        writeFailed = 1;
    }
    else if (iResult > 0
             && fwrite(outStr, sizeof(wchar_t), (size_t)iResult, pOutFile) != (size_t)iResult)
    {
        writeFailed = 1;
    }
    /* fclose() flushes buffered data, so its result matters for output files. */
    if (fclose(pOutFile) != 0)
    {
        writeFailed = 1;
    }
    if (writeFailed)
    {
        fwprintf(stderr, L"ファイル書き込み失敗: out.txt");
        return 1;
    }

    return 0;
}

/**
 * Convert an EUC-JP byte string to UTF-16 code units.
 *
 * Each character is classified from the bytes at the current position:
 *   - lead byte 0x00-0x7f         : 1 byte  (control codes and ASCII)
 *   - 2-byte code 0x8ea1-0x8edf   : 2 bytes (half-width katakana)
 *   - 2-byte code 0xa1a1-0xf4a6   : 2 bytes (kanji)
 *   - 3-byte code 0x8fa2af-0x8fede3, lead byte 0x8f : 3 bytes (kanji)
 * The range test is applied to the combined code value; trailing bytes are
 * not validated individually. A sequence that is classified as valid but has
 * no entry in euc_jp_to_utf16_table is converted to DUMMY_CODE.
 *
 * NOTE(review): when the input ends in the middle of a multi-byte sequence
 * the missing bytes are treated as 0x00 before classification, so a truncated
 * final character is either rejected or looked up with zero trail bytes.
 *
 * @param[out] dest      Receives the converted code units. May be NULL only
 *                       when dest_size is 0. No terminator is appended.
 * @param[in]  dest_size Capacity of dest in wchar_t elements. Pass 0 to count
 *                       the required elements without writing anything.
 * @param[in]  src       EUC-JP input bytes (not modified).
 * @param[in]  src_size  Number of bytes in src; must be at least 1.
 *
 * @return The number of characters converted (or counted when dest_size is
 *         0). If dest becomes full before the input is exhausted, conversion
 *         stops and the number of characters written so far is returned.
 *         Returns (-1) for a NULL argument, empty input, a byte sequence
 *         outside the accepted ranges, or a count that does not fit in int.
 */
int EucJpToUtf16(wchar_t *dest, size_t dest_size, char *src, size_t src_size)
{
    /*
     * byType values as used by the lookup below: a BRANCH entry supplies
     * dwBitmapIndex for the next byte, a LEAF entry supplies dwUtf16Code.
     */
    enum { NODE_BRANCH = 2, NODE_LEAF = 3 };

    size_t  converted = 0;
    size_t  cursor    = 0;

    /*
     * Validate the arguments.
     */
    if (dest_size != 0 && dest == NULL)
    {
        return (-1);
    }
    if (src == NULL || src_size < 1)
    {
        return (-1);
    }

    while (cursor < src_size)
    {
        unsigned char   seq[3]    = { 0, 0, 0 };
        size_t          remaining = src_size - cursor;
        size_t          available = (remaining < sizeof(seq)) ? remaining : sizeof(seq);
        size_t          seqLen;
        unsigned long   code;

        /* Fetch up to three bytes; bytes past the end of src stay zero. */
        memcpy(seq, src + cursor, available);

        /* Determine how many bytes this character occupies. */
        if (seq[0] <= 0x7f)
        {
            seqLen = 1;
        }
        else if (seq[0] != 0x8f)
        {
            code = ((unsigned long)seq[0] << 8) | seq[1];
            if (!((0x8ea1 <= code && code <= 0x8edf)
                  || (0xa1a1 <= code && code <= 0xf4a6)))
            {
                /* A code that must not appear here. */
                return (-1);
            }
            seqLen = 2;
        }
        else
        {
            code = ((unsigned long)seq[0] << 16)
                 | ((unsigned long)seq[1] << 8)
                 | seq[2];
            if (code < 0x8fa2af || 0x8fede3 < code)
            {
                /* A code that must not appear here. */
                return (-1);
            }
            seqLen = 3;
        }

        /* Stop once dest has no room left for another character. */
        if (dest_size != 0 && dest_size <= converted)
        {
            return (int)converted;
        }
        /* The result is reported as int; refuse to count past its range. */
        if (converted >= (size_t)INT_MAX)
        {
            return (-1);
        }

        if (dest_size != 0)
        {
            unsigned long   unicode = DUMMY_CODE;
            unsigned long   index   = seq[0];
            size_t          depth;

            /*
             * Walk the table one byte at a time: every byte but the last must
             * land on a BRANCH entry, the last must land on a LEAF entry.
             * Any mismatch leaves the substitute DUMMY_CODE in place.
             */
            for (depth = 0; depth < seqLen; depth++)
            {
                if (depth + 1 == seqLen)
                {
                    if (euc_jp_to_utf16_table[index].byType == NODE_LEAF)
                    {
                        unicode = euc_jp_to_utf16_table[index].dwUtf16Code;
                    }
                    break;
                }
                if (euc_jp_to_utf16_table[index].byType != NODE_BRANCH)
                {
                    break;
                }
                index = euc_jp_to_utf16_table[index].dwBitmapIndex + seq[depth + 1];
            }

            *dest = (wchar_t)unicode;
            dest++;
        }

        converted++;
        cursor += seqLen;
    }

    return (int)converted;
}
・2008-10-01
ページのTOPへ
フロントページへ
ページのTOPへ