概述
Windows 的 C++ 工程通常使用 Unicode 字符集或多字节字符集，在中文路径下调用接口时经常会遇到字符编码转换的问题。为此整理了如下几个函数：
- AnsiToUnicodeWStr
- UnicodeToAnsiStr
- UnicodeToUtf8Str
- Utf8ToUnicodeWStr
- AnsiToUtf8Str
- Utf8ToAnsiStr
- IsUTF8
函数实现
// Converts a wide (UTF-16) string to a UTF-8 encoded std::string.
//
// The output buffer is pre-sized to three bytes per input code unit plus one
// spare byte, then trimmed to the count reported by WideCharToMultiByte.
// When the API reports 0 (e.g. for an empty input), the result is empty.
std::string UnicodeToUtf8Str(const std::wstring& in)
{
    std::string utf8(in.length() * 3 + 1, ' ');

    // The Win32 API takes int lengths, hence the explicit narrowing.
    const int srcUnits = static_cast<int>(in.length());
    const int dstBytes = static_cast<int>(utf8.length());

    const size_t written = ::WideCharToMultiByte(
        CP_UTF8, 0,
        in.c_str(), srcUnits,
        &utf8[0], dstBytes,
        NULL, NULL);

    utf8.resize(written);
    return utf8;
}
// Converts a wide (UTF-16) string to the process's active ANSI code page
// (CP_ACP).
//
// The required byte count is queried from WideCharToMultiByte first instead of
// guessing "two bytes per character": when the active code page is UTF-8
// (code page 65001), a single UTF-16 unit can need three bytes, and a
// too-small buffer makes the API fail, which previously produced an empty
// result.
//
// Returns an empty string for empty input, for input too long for the API's
// int length parameter, or when the conversion fails.
std::string UnicodeToAnsiStr(const std::wstring& in)
{
    // WideCharToMultiByte takes int lengths; 0x7FFFFFFF is the largest int on
    // Windows. A zero length is rejected by the API, so handle it here.
    if (in.empty() || in.length() > static_cast<size_t>(0x7FFFFFFF))
    {
        return std::string();
    }
    const int srcUnits = static_cast<int>(in.length());

    // Pass 1: a null output buffer of size 0 asks only for the required size.
    const int required = ::WideCharToMultiByte(
        CP_ACP, 0, in.c_str(), srcUnits, NULL, 0, NULL, NULL);
    if (required <= 0)
    {
        return std::string();
    }

    // Pass 2: convert into an exactly-sized buffer.
    std::string ansi(static_cast<size_t>(required), '\0');
    const int written = ::WideCharToMultiByte(
        CP_ACP, 0, in.c_str(), srcUnits, &ansi[0], required, NULL, NULL);

    ansi.resize(written > 0 ? static_cast<size_t>(written) : 0);
    return ansi;
}
// Converts a string in the process's active ANSI code page (CP_ACP) to a wide
// (UTF-16) string.
//
// The output buffer is pre-sized to one wide character per input byte and then
// trimmed to the count reported by MultiByteToWideChar. When the API reports 0
// (e.g. for an empty input), the result is empty.
std::wstring AnsiToUnicodeWStr(const std::string& in)
{
    const std::string::size_type srcBytes = in.length();
    std::wstring wide(srcBytes, L' ');

    // The Win32 API takes int lengths, hence the explicit narrowing.
    const size_t produced = ::MultiByteToWideChar(
        CP_ACP, 0,
        in.c_str(), static_cast<int>(srcBytes),
        &wide[0], static_cast<int>(wide.length()));

    wide.resize(produced);
    return wide;
}
// Converts a UTF-8 encoded string to a wide (UTF-16) string.
//
// The output buffer is pre-sized to one wide character per input byte and then
// trimmed to the count reported by MultiByteToWideChar. When the API reports 0
// (e.g. for an empty input), the result is empty.
std::wstring Utf8ToUnicodeWStr(const std::string& in)
{
    const std::string::size_type srcBytes = in.length();
    std::wstring wide(srcBytes, L' ');

    // The Win32 API takes int lengths, hence the explicit narrowing.
    const size_t produced = ::MultiByteToWideChar(
        CP_UTF8, 0,
        in.c_str(), static_cast<int>(srcBytes),
        &wide[0], static_cast<int>(wide.length()));

    wide.resize(produced);
    return wide;
}
// Converts a string from the active ANSI code page to UTF-8 by going through
// a wide-string intermediate (AnsiToUnicodeWStr, then UnicodeToUtf8Str).
std::string AnsiToUtf8Str(const std::string& in)
{
    return UnicodeToUtf8Str(AnsiToUnicodeWStr(in));
}
// Converts a UTF-8 string to the active ANSI code page by going through a
// wide-string intermediate (Utf8ToUnicodeWStr, then UnicodeToAnsiStr).
std::string Utf8ToAnsiStr(const std::string& in)
{
    return UnicodeToAnsiStr(Utf8ToUnicodeWStr(in));
}
// Reports whether a byte buffer contains well-formed UTF-8.
//
// pBuffer: start of the bytes to inspect (may be null).
// size:    number of bytes available at pBuffer.
//
// Returns true when every complete sequence in the buffer is a well-formed
// UTF-8 sequence of 1 to 4 bytes, false as soon as an ill-formed one is found.
// Overlong encodings, UTF-16 surrogate code points and values above U+10FFFF
// are rejected. Pure ASCII and empty/null buffers return true.
//
// A multi-byte sequence that is cut off by the end of the buffer is tolerated
// (the scan stops and returns true), so a prefix of a longer UTF-8 stream is
// still accepted.
bool IsUTF8(const void* pBuffer, long size)
{
    // Nothing to inspect: no byte can disprove UTF-8.
    if (!pBuffer || size <= 0)
    {
        return true;
    }

    const unsigned char* cur = static_cast<const unsigned char*>(pBuffer);
    const unsigned char* const end = cur + size;

    while (cur < end)
    {
        const unsigned char lead = *cur;

        if (lead < 0x80) // 0xxxxxxx: single-byte ASCII
        {
            ++cur;
            continue;
        }

        long seqLen = 0;                 // total bytes in this sequence
        unsigned char secondMin = 0x80;  // allowed range of the 2nd byte;
        unsigned char secondMax = 0xBF;  // narrowed for some lead bytes below

        if (lead < 0xC2)
        {
            // 0x80..0xBF: continuation byte without a lead byte.
            // 0xC0/0xC1: could only start an overlong 2-byte form.
            return false;
        }
        else if (lead < 0xE0) // 110xxxxx: 2-byte sequence
        {
            seqLen = 2;
        }
        else if (lead < 0xF0) // 1110xxxx: 3-byte sequence
        {
            seqLen = 3;
            if (lead == 0xE0)
            {
                secondMin = 0xA0; // below this is an overlong form
            }
            else if (lead == 0xED)
            {
                secondMax = 0x9F; // above this is a surrogate (U+D800..U+DFFF)
            }
        }
        else if (lead < 0xF5) // 11110xxx: 4-byte sequence
        {
            seqLen = 4;
            if (lead == 0xF0)
            {
                secondMin = 0x90; // below this is an overlong form
            }
            else if (lead == 0xF4)
            {
                secondMax = 0x8F; // above this exceeds U+10FFFF
            }
        }
        else
        {
            // 0xF5..0xFF never appear in UTF-8.
            return false;
        }

        // Sequence runs past the end of the buffer: tolerate the truncated tail.
        if (end - cur < seqLen)
        {
            break;
        }

        if (cur[1] < secondMin || cur[1] > secondMax)
        {
            return false;
        }
        // Remaining bytes must be plain continuation bytes (10xxxxxx).
        for (long i = 2; i < seqLen; ++i)
        {
            if ((cur[i] & 0xC0) != 0x80)
            {
                return false;
            }
        }

        cur += seqLen;
    }

    return true;
}
|