C++ -讀取 txt 文字檔 取得檔案的編碼方式

環境
Windows 11 64-bit
Visual Studio 2022

只給我一個 txt 檔,我要怎麼知道它裡面是用什麼編碼方式呢?

這裡針對 Windows 記事本提供的編碼方式做判斷。這篇文章先討論 UTF-16 LE、UTF-16 BE、使用 BOM 的 UTF-8 這 3 種(都可以直接從檔頭判斷),另外兩種(ANSI/Big5 與不含 BOM 的 UTF-8)放在下一篇。

先將以下內容存成三種編碼格式

哈利波特_해리 포터_Harry Potter_ハリー・ポッター

1. UTF-16 LE

檔頭是 FF FE

2. UTF-16 BE

檔頭是 FE FF

3. 使用 BOM 的 UTF-8

檔頭是 EF BB BF

#include <iostream>
#include <fstream>
using namespace std;

// Text-file encodings this sample distinguishes. Every numeric value is
// written out explicitly so it does not depend on declaration order.
enum class FileEncode
{
    UNKNOWN   = 0,  // no encoding was identified
    ANSI_BIG5 = 1,
    UTF8      = 2,
    UTF8_BOM  = 3,
    UTF16_LE  = 4,  // the default "UNICODE"
    UTF16_BE  = 5
};

// Writes the enumerator name of nEncode to standard output, followed by a
// newline. A value that matches no enumerator produces no output at all.
void PrintTxtEncode(FileEncode nEncode)
{
    const char* label = nullptr;

    switch (nEncode)
    {
    case FileEncode::UNKNOWN:   label = "UNKNOWN";   break;
    case FileEncode::ANSI_BIG5: label = "ANSI_BIG5"; break;
    case FileEncode::UTF8:      label = "UTF8";      break;
    case FileEncode::UTF8_BOM:  label = "UTF8_BOM";  break;
    case FileEncode::UTF16_LE:  label = "UTF16_LE";  break;
    case FileEncode::UTF16_BE:  label = "UTF16_BE";  break;
    }

    // label stays null only when the switch matched nothing.
    if (label != nullptr)
        std::cout << label << std::endl;
}

// Identifies the encoding of the file at szPath from its leading bytes.
//
// Returns UTF16_LE when the file starts with FF FE, UTF16_BE for FE FF and
// UTF8_BOM for EF BB BF. Returns UNKNOWN when the file cannot be opened or
// when none of those headers matches; ANSI_BIG5 and UTF8 are never returned
// by this function.
FileEncode GetTxtEncode(const char* szPath)
{
    const int HEAD_SIZE = 32;

    std::ifstream input(szPath, std::ios::binary);
    if (!input.is_open())
        return FileEncode::UNKNOWN;

    // Zero-filled up front, so bytes that a short file does not supply
    // compare as 0x00 and cannot match any header below.
    char head[HEAD_SIZE] = {};
    input.read(head, HEAD_SIZE);
    input.close();

    // Only the first three bytes are inspected; go through unsigned char so
    // values above 0x7F compare correctly where plain char is signed.
    const unsigned char first  = static_cast<unsigned char>(head[0]);
    const unsigned char second = static_cast<unsigned char>(head[1]);
    const unsigned char third  = static_cast<unsigned char>(head[2]);

    if (first == 0xFF && second == 0xFE)
        return FileEncode::UTF16_LE;

    if (first == 0xFE && second == 0xFF)
        return FileEncode::UTF16_BE;

    if (first == 0xEF && second == 0xBB && third == 0xBF)
        return FileEncode::UTF8_BOM;

    // ANSI_BIG5 and UTF8 are harder to tell apart; not handled here.
    return FileEncode::UNKNOWN;
}

void CheckTxtEncode(const char* szPath)
{
    cout << szPath << endl;
    FileEncode nEncode = GetTxtEncode(szPath);
    PrintTxtEncode(nEncode);
    cout << endl;
}

// Runs the encoding check on the three sample files in order, then runs the
// shell's "pause" command before exiting.
int main()
{
    const char* const sampleFiles[] = {
        "UTF-16_LE.txt",
        "UTF-16_BE.txt",
        "UTF-8_BOM.txt",
    };

    for (const char* path : sampleFiles)
        CheckTxtEncode(path);

    system("pause");
    return 0;
}

下一篇:C++ -讀取 txt 文字檔 區分 big5 和 utf8 檔案
https://husking-studio.com/cpp-txt-file-05/

發佈留言

發佈留言必須填寫的電子郵件地址不會公開。 必填欄位標示為 *