環境
Windows 11 64-bit
Visual Studio 2022
只給我一個 txt 檔,我要怎麼知道它裡面是用什麼編碼方式呢?
這裡針對 Windows 記事本提供的編碼方式做判斷。這篇文章先討論 UTF-16 LE、UTF-16 BE、使用 BOM 的 UTF-8 這 3 種(都可以直接從檔頭判斷),另外兩種(ANSI/Big5 與不含 BOM 的 UTF-8)放在下一篇。
先將以下內容存成三種編碼格式
哈利波特_해리 포터_Harry Potter_ハリー・ポッター |
1. UTF-16 LE
檔頭是 FF FE
2. UTF-16 BE
檔頭是 FE FF
3. 使用 BOM 的 UTF-8
檔頭是 EF BB BF
#include <iostream>
#include <fstream>
using namespace std;
// Encodings that the detection code in this file can report for a text file.
// The numeric values are written out explicitly so they do not silently
// change if an enumerator is later inserted or reordered.
enum class FileEncode
{
    UNKNOWN   = 0,
    ANSI_BIG5 = 1,
    UTF8      = 2,
    UTF8_BOM  = 3,
    UTF16_LE  = 4, // the default "UNICODE"
    UTF16_BE  = 5,
};
// Prints the symbolic name of nEncode to standard output, followed by a
// newline and a flush.
//
// nEncode: the encoding value to describe.
//
// A value that matches none of the enumerators produces no output at all.
void PrintTxtEncode(FileEncode nEncode)
{
    // Pick the label first so that the stream is written in exactly one place.
    const char* label = nullptr;

    switch (nEncode)
    {
    case FileEncode::UNKNOWN:   label = "UNKNOWN";   break;
    case FileEncode::ANSI_BIG5: label = "ANSI_BIG5"; break;
    case FileEncode::UTF8:      label = "UTF8";      break;
    case FileEncode::UTF8_BOM:  label = "UTF8_BOM";  break;
    case FileEncode::UTF16_LE:  label = "UTF16_LE";  break;
    case FileEncode::UTF16_BE:  label = "UTF16_BE";  break;
    }

    // No label means the value is outside the enumerators: stay silent.
    if (label != nullptr)
        std::cout << label << std::endl;
}
// Determines the encoding of a text file by inspecting the signature bytes
// at the very start of the file.
//
// szPath: path of the file to inspect; it is opened in binary mode.
//
// Returns:
//   FileEncode::UTF16_LE  when the file starts with FF FE
//   FileEncode::UTF16_BE  when the file starts with FE FF
//   FileEncode::UTF8_BOM  when the file starts with EF BB BF
//   FileEncode::UNKNOWN   when the file cannot be opened or none of the
//                         signatures above matches
FileEncode GetTxtEncode(const char* szPath)
{
    std::ifstream input(szPath, std::ios::binary);
    if (!input.is_open())
        return FileEncode::UNKNOWN;

    // The buffer starts zero-filled, so bytes that a short file cannot supply
    // stay 0 and never match a signature byte (all of which are non-zero).
    const int MAX_READ_NUM = 32;
    char head[MAX_READ_NUM] = {};
    input.read(head, MAX_READ_NUM);
    input.close();

    // Compare as unsigned values: plain char may be signed, which would make
    // bytes >= 0x80 compare unequal to the hexadecimal constants below.
    const unsigned char first  = static_cast<unsigned char>(head[0]);
    const unsigned char second = static_cast<unsigned char>(head[1]);
    const unsigned char third  = static_cast<unsigned char>(head[2]);

    if (first == 0xFF && second == 0xFE)
        return FileEncode::UTF16_LE;   // "Unicode" signature

    if (first == 0xFE && second == 0xFF)
        return FileEncode::UTF16_BE;   // "Unicode big endian" signature

    if (first == 0xEF && second == 0xBB && third == 0xBF)
        return FileEncode::UTF8_BOM;   // UTF-8 with BOM signature

    // ANSI_BIG5 and UTF8 are more troublesome to tell apart; they are
    // discussed in the next article and are reported as UNKNOWN here.
    return FileEncode::UNKNOWN;
}
void CheckTxtEncode(const char* szPath)
{
cout << szPath << endl;
FileEncode nEncode = GetTxtEncode(szPath);
PrintTxtEncode(nEncode);
cout << endl;
}
// Entry point: reports the encoding of the three sample files, which are
// looked up relative to the current working directory, then runs the
// shell's "pause" command before returning 0.
int main()
{
    const char* const sampleFiles[] = {
        "UTF-16_LE.txt",
        "UTF-16_BE.txt",
        "UTF-8_BOM.txt",
    };

    for (const char* path : sampleFiles)
        CheckTxtEncode(path);

    system("pause");
    return 0;
}
下一篇:C++ -讀取 txt 文字檔 區分 big5 和 utf8 檔案
https://husking-studio.com/cpp-txt-file-05/