vc++读取文本文件并自动识别编码且转换
版权声明:
本文为博主原创文章,转载请声明原文链接...谢谢。o_0。
更新时间:
2022-10-06 11:50:44
温馨提示:
学无止境,技术类文章有它的时效性,请留意文章更新时间,如发现内容有误请留言指出,防止别人"踩坑",我会及时更新文章
窄字符到宽字符转换
/**
 * Converts a multi-byte (narrow) string to a wide string: string => wstring.
 *
 * @param str      Source bytes, interpreted in the code page given by m_encode.
 * @param m_encode Windows code page identifier passed to MultiByteToWideChar
 *                 (for example CP_ACP or CP_UTF8).
 * @return The converted text, or an empty string when str is empty or the
 *         conversion fails.
 */
std::wstring Utils::String2WString(const std::string& str, int m_encode)
{
    // MultiByteToWideChar rejects a zero-length input, so answer that case directly.
    if (str.empty())
    {
        return {};
    }
    const int srcLen = static_cast<int>(str.size());

    // First pass: ask how many wide characters the result needs.
    const int len = MultiByteToWideChar(m_encode, 0, str.data(), srcLen, nullptr, 0);
    if (len <= 0)
    {
        return {};
    }

    // Convert straight into the result's own storage. The element type is
    // wchar_t regardless of the UNICODE setting, and no manual buffer has to
    // be released on any exit path.
    std::wstring result(static_cast<size_t>(len), L'\0');
    const int written = MultiByteToWideChar(m_encode, 0, str.data(), srcLen, &result[0], len);
    if (written <= 0)
    {
        return {};
    }

    // Keep exactly the characters that were written, embedded NULs included.
    result.resize(static_cast<size_t>(written));
    return result;
}

// ---- Next section: UTF-8 string detection ----
读取不带 BOM 头的 UTF-8 文件时,需要用此函数来识别其编码。
/**
 * Heuristically decides whether a byte buffer is UTF-8 encoded text.
 * Used to recognise UTF-8 files that carry no BOM.
 *
 * Rules applied to each sequence:
 *   - a byte below 0x80 is ASCII;
 *   - lead bytes 0xC0..0xDF, 0xE0..0xEF and 0xF0..0xF4 must be followed by
 *     1, 2 and 3 continuation bytes (10xxxxxx) respectively;
 *   - a continuation byte in lead position (0x80..0xBF) or a lead byte of
 *     0xF5 and above makes the buffer non-UTF-8;
 *   - a sequence cut off by the end of the buffer is tolerated.
 * Overlong encodings and surrogate code points are not checked.
 *
 * @param pBuffer Start of the bytes to inspect; must be readable for size bytes.
 * @param size    Number of bytes in pBuffer.
 * @return true if every complete sequence is well formed (also for an empty
 *         buffer), false otherwise.
 */
bool Utils::IsUTF8Text(const void* pBuffer, const long size)
{
    if (size <= 0)
    {
        return true;
    }

    const unsigned char* cur = static_cast<const unsigned char*>(pBuffer);
    const unsigned char* const end = cur + size;

    while (cur < end)
    {
        const unsigned char lead = *cur;

        // Number of continuation bytes announced by the lead byte.
        int trailing = 0;
        if (lead < 0x80)        // 0xxxxxxx: ASCII
        {
            trailing = 0;
        }
        else if (lead < 0xC0)   // 10xxxxxx: continuation byte without a lead byte
        {
            return false;
        }
        else if (lead < 0xE0)   // 110xxxxx: 2-byte sequence
        {
            trailing = 1;
        }
        else if (lead < 0xF0)   // 1110xxxx: 3-byte sequence
        {
            trailing = 2;
        }
        else if (lead < 0xF5)   // 11110xxx: 4-byte sequence (U+10000..U+10FFFF)
        {
            trailing = 3;
        }
        else                    // 0xF5..0xFF never appear in UTF-8
        {
            return false;
        }

        // The buffer ends in the middle of a sequence: stop without rejecting,
        // everything before it was valid.
        if (end - cur <= trailing)
        {
            break;
        }

        for (int i = 1; i <= trailing; ++i)
        {
            if ((cur[i] & 0xC0) != 0x80)
            {
                return false;
            }
        }
        cur += trailing + 1;
    }
    return true;
}

// ---- Next section: reading the text file ----
/**
 * Reads a whole text file, detects its encoding and returns the decoded text.
 *
 * Detection order:
 *   1. BOM FF FE     -> UTF-16 little endian
 *   2. BOM FE FF     -> UTF-16 big endian
 *   3. BOM EF BB BF  -> UTF-8
 *   4. no BOM        -> UTF-8 when IsUTF8Text() accepts the bytes, else CP_ACP
 * The BOM is not part of the returned text.
 *
 * @param filePath Path of the file to read.
 * @return The file content, or an empty string when the file cannot be
 *         opened or read, or is empty.
 */
CDuiString Utils::FileGetContents(const CDuiString& filePath)
{
    FILE* pFile = _tfopen(filePath.GetData(), _T("rb"));
    if (pFile == nullptr) {
        //Log::Error(filePath);
        return {};
    }

    // Closes the file on every exit path, including early returns and
    // exceptions thrown by the allocations below.
    struct FileCloser
    {
        FILE* file;
        ~FileCloser() { fclose(file); }
    } closer{ pFile };

    // Look at the first three bytes for a byte order mark.
    unsigned char bom[3] = { 0 };
    const size_t bomRead = fread(bom, 1, sizeof(bom), pFile);

    // Total length of the file in bytes.
    if (fseek(pFile, 0L, SEEK_END) != 0)
    {
        return {};
    }
    const long total = ftell(pFile);
    if (total <= 0)
    {
        return {};
    }

    // https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
    // FF FE    UTF-16 LE
    // FE FF    UTF-16 BE
    // EF BB BF UTF-8
    bool isUtf16 = false;
    bool isBigEndian = false;
    bool hasBom = false;
    int encode = CP_ACP;
    long offset = 0;
    if (bomRead >= 2 && bom[0] == 0xFF && bom[1] == 0xFE)
    {
        isUtf16 = true;
        hasBom = true;
        offset = 2;
    }
    else if (bomRead >= 2 && bom[0] == 0xFE && bom[1] == 0xFF)
    {
        isUtf16 = true;
        isBigEndian = true;
        hasBom = true;
        offset = 2;
    }
    else if (bomRead >= 3 && bom[0] == 0xEF && bom[1] == 0xBB && bom[2] == 0xBF)
    {
        encode = CP_UTF8;
        hasBom = true;
        offset = 3;
    }

    // Read everything after the BOM. The byte count is carried explicitly so
    // that NUL bytes (which make up half of typical UTF-16 text) do not cut
    // the data short.
    if (total <= offset || fseek(pFile, offset, SEEK_SET) != 0)
    {
        return {};
    }
    std::string bytes(static_cast<size_t>(total - offset), '\0');
    const size_t got = fread(&bytes[0], sizeof(char), bytes.size(), pFile);
    bytes.resize(got);

    std::wstring wide;
    if (isUtf16)
    {
        // The UTF-16 code pages (1200/1201) are listed as available only to
        // managed applications, so MultiByteToWideChar cannot decode them;
        // assemble the 16-bit code units by hand instead. A trailing odd byte
        // is ignored.
        const size_t units = bytes.size() / 2;
        wide.resize(units);
        for (size_t i = 0; i < units; ++i)
        {
            const unsigned int first = static_cast<unsigned char>(bytes[2 * i]);
            const unsigned int second = static_cast<unsigned char>(bytes[2 * i + 1]);
            const unsigned int unit = isBigEndian ? ((first << 8) | second)
                                                  : ((second << 8) | first);
            wide[i] = static_cast<wchar_t>(unit);
        }
    }
    else
    {
        // Without a BOM the content may still be UTF-8: check the bytes.
        if (!hasBom && IsUTF8Text(bytes.data(), static_cast<long>(bytes.size())))
        {
            encode = CP_UTF8;
        }
        wide = String2WString(bytes, encode);
    }

    CDuiString tmp = wide.c_str();
    return tmp;
}