vc++读取文本文件并自动识别编码且转换

来源：赵克立博客　分类：C/C++　标签：C/C++　发布时间：2022-10-06 11:50:44　最后更新：2022-10-06 11:50:44　浏览：529

本文为博主原创文章，转载请声明原文链接...谢谢。o_0。

原文链接:

http://www.zhaokeli.com/article/8747.html

更新时间：

2022-10-06 11:50:44

温馨提示：

学无止境,技术类文章有它的时效性,请留意文章更新时间,如发现内容有误请留言指出,防止别人"踩坑",我会及时更新文章

窄字符到宽字符转换

// string => wstring
std::wstring Utils::String2WString(const std::string& str, int m_encode)
{
	wstring result;
	//获取缓冲区大小，并申请空间，缓冲区大小按字符计算  
	const int len = MultiByteToWideChar(m_encode, 0, str.c_str(), (int)str.size(), nullptr, 0);
	const auto buffer = new TCHAR[len + 1];
	//多字节编码转换成宽字节编码  
	MultiByteToWideChar(m_encode, 0, str.c_str(), (int)str.size(), buffer, len);
	//添加字符串结尾  
	buffer[len] = '\0';
	//删除缓冲区并返回值  
	result.append(buffer);
	delete[] buffer;
	return result;
}

utf8字符串判断

读取不带 BOM 头的 UTF-8 文件时，需要用此函数来识别编码

bool Utils::IsUTF8Text(const void* pBuffer, const long size)
{
	bool IsUTF8 = true;
	auto start = (unsigned char*)pBuffer;
	const unsigned char* end = (unsigned char*)pBuffer + size;
	while (start < end)
	{
		if (*start < 0x80) // (10000000): 值小于0x80的为ASCII字符    
		{
			start++;
		}
		else if (*start < (0xC0)) // (11000000): 值介于0x80与0xC0之间的为无效UTF-8字符    
		{
			IsUTF8 = false;
			break;
		}
		else if (*start < (0xE0)) // (11100000): 此范围内为2字节UTF-8字符    
		{
			if (start >= end - 1)
			{
				break;
			}

			if ((start[1] & (0xC0)) != 0x80)
			{
				IsUTF8 = false;
				break;
			}

			start += 2;
		}
		else if (*start < (0xF0)) // (11110000): 此范围内为3字节UTF-8字符    
		{
			if (start >= end - 2)
			{
				break;
			}

			if ((start[1] & (0xC0)) != 0x80 || (start[2] & (0xC0)) != 0x80)
			{
				IsUTF8 = false;
				break;
			}

			start += 3;
		}
		else
		{
			IsUTF8 = false;
			break;
		}
	}

	return IsUTF8;
}

读取文本

CDuiString Utils::FileGetContents(const CDuiString& filePath)
{
	FILE* pFile = _tfopen(filePath.GetData(), _T("rb"));
	if (pFile == nullptr) {
		//Log::Error(filePath);
		return{};
	}
	// 取三个字节查看文件类型
	char szFlag[3] = { 0 };
	fread(szFlag, 1, 3, pFile);
	// 取数据总长度
	fseek(pFile, 0L, SEEK_END);
	const int total = ftell(pFile);
	int offset = 0;

	//https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
	//FE FF UTF16BE
	//FF FE UTF16LE
	//EF BB BF UTF8
	int encode = CP_ACP;
	if ((unsigned char)szFlag[0] == 0xFF
		&& (unsigned char)szFlag[1] == 0xFE)
	{
		//Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications
		encode = 1201;
		offset = 2;
	}
	else if ((unsigned char)szFlag[0] == 0xFE
		&& (unsigned char)szFlag[1] == 0xFF)
	{
		//	Unicode UTF-16, big endian byte order; available only to managed applications
		encode = 1200;
		offset = 2;
	}
	else if ((unsigned char)szFlag[0] == 0xEF
		&& (unsigned char)szFlag[1] == 0xBB
		&& (unsigned char)szFlag[2] == 0xBF)
	{
		encode = CP_UTF8;
		offset = 3;
	}

	fseek(pFile, offset, SEEK_SET);
	const auto buf = new  char[total - offset + 1]{ 0 };
	fread(buf, sizeof(char), total - offset, pFile);
	const string strContent = buf;
	delete[] buf;
	fclose(pFile);

	// 无utf8 bom头的字符串再判断下
	if (encode == CP_ACP && IsUTF8Text(strContent.c_str(), strContent.length()))
	{
		encode = CP_UTF8;
	}
	CDuiString tmp = String2WString(strContent, encode).c_str();
	return tmp;
}