WideCharToMultiByte Simple Character Encoding Detection

1. 了解下这个 API
//z 2014-03-25 08:18:41 IS2120@BG57IV3 T3343244181.K.F1434403198[T1,L68,R2,V15]
// Converts a NUL-terminated wide (UTF-16) string to the system ANSI code page.
//
//   in     - NUL-terminated wide string to convert.
//   out    - destination buffer; on success it receives the converted bytes
//            including the terminating NUL.
//   cchout - capacity of `out`, in bytes.
//
// ErrorExit() is called when an argument is unusable or the conversion fails.
void UnicodeToAnsi(WCHAR *in, char *out, int cchout)
{
    if (in == NULL || out == NULL || cchout <= 0)
    {
        ErrorExit("UnicodeToAnsi: invalid argument") ;
        return ;
    }

    // Passing -1 as the input length makes the API treat `in` as
    // NUL-terminated and convert the terminator too. This matches the former
    // wcslen(in)+1 without narrowing a size_t to int.
    const int len = WideCharToMultiByte(CP_ACP,
                                        0,
                                        in,
                                        -1,
                                        out,
                                        cchout,
                                        NULL,
                                        NULL) ;
    if (!len)
    {
        // A zero return means the output buffer was too small or the input or
        // flags were rejected; it is not a memory-allocation failure. Leave
        // the caller an empty string rather than unterminated partial output.
        out[0] = '\0' ;
        ErrorExit("UnicodeToAnsi: conversion failed (output buffer too small or invalid input)") ;
    }
}

//z 2014-04-14 22:04:51 IS2120@BG57IV3 T1381068076.K.F1547169058[T4,L105,R3,V66]
2. 一个例子，将文件自动转换为 utf-8

// ChangeFileEncoding.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"
#include "ChangeFileEncoding.h"
#include <string>

#ifdef _DEBUG
#define new DEBUG_NEW
#endif


// The one and only application object

CWinApp theApp;

using namespace std;

// Walks the entries matching the given search pattern (e.g. "dir\*.*"),
// recursing into subdirectories, and hands .cpp/.h files that do not start
// with a UTF-8 BOM to convertGBToUTF8().
void recursiveFile(CString strFileType);
// Rewrites the file at strWritePath as UTF-8 with a BOM, using the
// NUL-terminated bytes in gb2312 (decoded via CP_ACP) as the content.
void convertGBToUTF8(CString strWritePath, const char* gb2312);

int _tmain(int argc, TCHAR* argv[], TCHAR* envp[])
{
	int nRetCode = 0;

	// 初始化 MFC 并在失败时显示错误
	if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0))
	{
		// TODO: 更改错误代码以符合您的需要
		_tprintf(_T("错误: MFC 初始化失败
"));
		nRetCode = 1;
	}
	else
	{
		/*for(int i = 0; i < argc; i++)
		{
			MessageBox(NULL, argv[i], L"Arglist contents", MB_OK);
		}*/
		//声明一个CFileFind类变量，以用来搜索
		
		//接受一个参数作为源代码文件的根目录
		TCHAR *lpszDirName = argv[1];
		CString strFileType;
		strFileType.Format(_T("%s\*.*"), lpszDirName);
		//递归此目录下的.h文件和.cpp文件，如果发现不是utf8编码则转换为utf8编码
		recursiveFile(strFileType);
		
	}

	return nRetCode;
}

// Walks every entry matching the search pattern `strFileType`
// (e.g. "c:\src\*.*"), descending into subdirectories.
//
// Each file whose name ends in ".cpp" or ".h" (case-sensitive) and does not
// start with the UTF-8 BOM is read completely and passed to convertGBToUTF8(),
// which rewrites it in place. Files that cannot be opened are skipped.
//
// Note: the content is handed over as a C string, so anything after an
// embedded NUL byte is not seen by the converter.
void recursiveFile( CString strFileType)
{
	CFileFind finder;
	BOOL isFinded = finder.FindFile(strFileType); // locate the first entry
	while(isFinded)
	{
		// FindNextFile() has to run before the current entry can be queried;
		// its result tells whether further entries follow.
		isFinded = finder.FindNextFile();
		if(finder.IsDots()) // skip "." and ".."
			continue;

		CString strFoundFile = finder.GetFilePath();
		if(finder.IsDirectory())
		{
			// Build the pattern by concatenation instead of passing a CString
			// object through Format()'s variadic argument list.
			recursiveFile(strFoundFile + _T("\\*.*"));
			continue;
		}

		// Only header and .cpp files are converted.
		if(strFoundFile.Right(4) != _T(".cpp") && strFoundFile.Right(2) != _T(".h"))
			continue;

		// Open() reports failure through its return value, so one unreadable
		// file no longer aborts the whole walk with an exception.
		CFile fileReader;
		if(!fileReader.Open(strFoundFile, CFile::modeRead))
			continue;

		// A file already carrying the UTF-8 BOM is left alone. The number of
		// bytes actually read is checked so that files shorter than three
		// bytes are not compared against uninitialised data.
		byte head[3] = {0, 0, 0};
		const UINT nHeadLength = fileReader.Read(head, sizeof(head));
		if(nHeadLength == sizeof(head)
			&& head[0] == 0xef && head[1] == 0xbb && head[2] == 0xbf)
		{
			fileReader.Close();
			continue;
		}
		fileReader.SeekToBegin();

		// Read the whole file in fixed-size chunks. A stack buffer replaces
		// the former new[] allocation that was released with scalar delete.
		char buf[256];
		std::string strContent;
		UINT nReadLength;
		while((nReadLength = fileReader.Read(buf, sizeof(buf))) != 0)
		{
			strContent.append(buf, nReadLength);
		}
		fileReader.Close();
		convertGBToUTF8(strFoundFile, strContent.c_str());
	}
	finder.Close();
}

// Rewrites the file at `strWritePath` as UTF-8 with a leading BOM.
//
//   strWritePath - path of the file to (re)create.
//   gb2312       - NUL-terminated source bytes. They are decoded with CP_ACP,
//                  i.e. the system ANSI code page, so they are treated as
//                  GB2312/GBK only on systems configured that way.
//
// The conversion is done before the target file is opened. If any step fails
// the function returns without touching the file, so the original content is
// never truncated and lost.
void convertGBToUTF8(CString strWritePath, const char* gb2312)
{
	if(gb2312 == NULL)
		return;

	std::string utf8;
	const int srcLen = static_cast<int>(strlen(gb2312));
	if(srcLen > 0) // the conversion APIs reject a zero-length input
	{
		// Explicit lengths are used throughout. With -1 the returned counts
		// include the terminator, which previously ended up being written
		// into the file as a stray NUL byte.
		const int wideLen = MultiByteToWideChar(CP_ACP, 0, gb2312, srcLen, NULL, 0);
		if(wideLen <= 0)
			return;
		std::wstring wide(wideLen, L'\0');
		if(MultiByteToWideChar(CP_ACP, 0, gb2312, srcLen, &wide[0], wideLen) <= 0)
			return;

		const int utf8Len = WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), wideLen, NULL, 0, NULL, NULL);
		if(utf8Len <= 0)
			return;
		utf8.resize(utf8Len);
		if(WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), wideLen, &utf8[0], utf8Len, NULL, NULL) <= 0)
			return;
	}

	CFile fp;
	if(!fp.Open(strWritePath, CFile::modeCreate|CFile::modeWrite|CFile::typeBinary, NULL))
		return;

	const unsigned char aryBOM[]  = {0xEF, 0xBB, 0xBF};
	fp.Write(aryBOM, sizeof(aryBOM));
	if(!utf8.empty())
		fp.Write(utf8.data(), static_cast<UINT>(utf8.size()));
	fp.Close();
}

//z 2014-04-14 22:04:51 IS2120@BG57IV3 T1381068076.K.F1547169058[T4,L105,R3,V66]
http://blog.csdn.net/visualcatsharp/article/details/7345854

//z 2014-05-06 12:00:46 L.239'43154 BG57IV3@XCL T1109932947.K.F253293061 [T409,L5358,R263,V7006]
3. v2

// ConvertZ.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"
#include "ConvertZ.h"
#include <string>

using namespace std;

#ifdef _DEBUG
#define new DEBUG_NEW
#endif

// The one and only application object
CWinApp theApp;

// Walks the entries matching the given search pattern (e.g. "dir\*.*"),
// recursing into subdirectories, and hands source files that do not start
// with a UTF-8 BOM to convertGBToUTF8().
void recursiveFile(CString strFileType);
// Rewrites the file at strWritePath as UTF-8 with a BOM, using the
// NUL-terminated bytes in gb2312 (decoded via CP_ACP) as the content.
void convertGBToUTF8(CString strWritePath, const char* gb2312);

int _tmain(int argc, TCHAR* argv[], TCHAR* envp[])
{
	int nRetCode = 0;

	// 初始化 MFC 并在失败时显示错误
	if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0))
	{
		// TODO: 更改错误代码以符合您的需要
		_tprintf(_T("错误: MFC 初始化失败
"));
		nRetCode = 1;
	}
	else
	{
		/*for(int i = 0; i < argc; i++)
		{
		MessageBox(NULL, argv[i], L"Arglist contents", MB_OK);
		}*/
		//声明一个CFileFind类变量，以用来搜索

		if(argc != 2)
		{
			CString strUsage;
			strUsage.Format(_T("usage : 
    %s dir
    dir [sample] : c:\src
"),argv[0]);
			_tprintf(strUsage.GetBuffer());
			strUsage.ReleaseBuffer();

			return nRetCode;
		}

		//接受一个参数作为源代码文件的根目录
		TCHAR *lpszDirName = argv[1];
		CString strFileType;
		strFileType.Format(_T("%s\*.*"), lpszDirName);
		//递归此目录下的.h文件和.cpp文件，如果发现不是utf8编码则转换为utf8编码
		recursiveFile(strFileType);
	}

	return nRetCode;
}

bool isSrcType(const CString strFileType)
{
	CString strExt_R4 = strFileType.Right(4);
	CString strExt_R2 = strFileType.Right(2);

	if ((strExt_R4.CompareNoCase(_T(".cpp")) == 0)
		|| (strExt_R2.CompareNoCase(_T(".c")) == 0)
		|| (strExt_R2.CompareNoCase(_T(".h")) == 0)
		|| (strExt_R4.CompareNoCase(_T(".cxx")) == 0)
		|| (strExt_R4.CompareNoCase(_T(".hpp")) == 0)
		)
	{
		return true;
	}

	return false;
}

// Walks every entry matching the search pattern `strFileType`
// (e.g. "c:\src\*.*"), descending into subdirectories.
//
// Each file accepted by isSrcType() that does not start with the UTF-8 BOM is
// read completely and passed to convertGBToUTF8(), which rewrites it in
// place. Files that cannot be opened are skipped.
//
// Note: the content is handed over as a C string, so anything after an
// embedded NUL byte is not seen by the converter.
void recursiveFile( CString strFileType)
{
	CFileFind finder;
	BOOL isFinded = finder.FindFile(strFileType); // locate the first entry
	while(isFinded)
	{
		// FindNextFile() has to run before the current entry can be queried;
		// its result tells whether further entries follow.
		isFinded = finder.FindNextFile();
		if(finder.IsDots()) // skip "." and ".."
			continue;

		CString strFoundFile = finder.GetFilePath();
		if(finder.IsDirectory())
		{
			// Build the pattern by concatenation instead of passing a CString
			// object through Format()'s variadic argument list.
			recursiveFile(strFoundFile + _T("\\*.*"));
			continue;
		}

		// Only recognised source files are converted.
		if(!isSrcType(strFoundFile))
			continue;

		// Open() reports failure through its return value, so one unreadable
		// file no longer aborts the whole walk with an exception.
		CFile fileReader;
		if(!fileReader.Open(strFoundFile, CFile::modeRead|CFile::typeBinary))
			continue;

		// A file already carrying the UTF-8 BOM is left alone. The number of
		// bytes actually read is checked so that files shorter than three
		// bytes are not compared against uninitialised data.
		byte head[3] = {0, 0, 0};
		const UINT nHeadLength = fileReader.Read(head, sizeof(head));
		if(nHeadLength == sizeof(head)
			&& head[0] == 0xef && head[1] == 0xbb && head[2] == 0xbf)
		{
			fileReader.Close();
			continue;
		}
		fileReader.SeekToBegin();

		// Read the whole file in fixed-size chunks. A stack buffer replaces
		// the former new[] allocation that was released with scalar delete.
		char buf[256];
		std::string strContent;
		UINT nReadLength;
		while((nReadLength = fileReader.Read(buf, sizeof(buf))) != 0)
		{
			strContent.append(buf, nReadLength);
		}
		fileReader.Close();
		convertGBToUTF8(strFoundFile, strContent.c_str());
	}
	finder.Close();
}

// Rewrites the file at `strWritePath` as UTF-8 with a leading BOM.
//
//   strWritePath - path of the file to (re)create.
//   gb2312       - NUL-terminated source bytes. They are decoded with CP_ACP,
//                  i.e. the system ANSI code page, so they are treated as
//                  GB2312/GBK only on systems configured that way.
//
// The conversion is done before the target file is opened. If any step fails
// the function returns without touching the file, so the original content is
// never truncated and lost. Empty input produces a file holding only the BOM.
void convertGBToUTF8(CString strWritePath, const char* gb2312)
{
	if(gb2312 == NULL)
		return;

	std::string utf8;
	const int ngblen = static_cast<int>(strlen(gb2312));
	if(ngblen > 0) // the conversion APIs reject a zero-length input
	{
		// Explicit lengths are used throughout, so neither buffer needs a
		// terminator and no NUL byte can reach the output file.
		const int wideLen = MultiByteToWideChar(CP_ACP, 0, gb2312, ngblen, NULL, 0);
		if(wideLen <= 0)
			return;
		std::wstring wide(wideLen, L'\0');
		if(MultiByteToWideChar(CP_ACP, 0, gb2312, ngblen, &wide[0], wideLen) <= 0)
			return;

		const int newLen = WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), wideLen, NULL, 0, NULL, NULL);
		if(newLen <= 0)
			return;
		utf8.resize(newLen);
		if(WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), wideLen, &utf8[0], newLen, NULL, NULL) <= 0)
			return;
	}

	CFile fp;
	if(!fp.Open(strWritePath, CFile::modeCreate|CFile::modeWrite|CFile::typeBinary, NULL))
		return;

	const unsigned char aryBOM[]  = {0xEF, 0xBB, 0xBF};
	fp.Write(aryBOM, sizeof(aryBOM));
	if(!utf8.empty())
		fp.Write(utf8.data(), static_cast<UINT>(utf8.size()));
	fp.Close();
}

//z 2014-05-22 16:55:50 L.223'25450 BG57IV3 T427209771 .K.F253293061 [T484,L6693,R325,V8206]

Introduction

One very commonly asked question in programming is how to detect the character encoding of a string. Well, I'm going to share a cool method I came up with that can detect if a string is UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, or UTF-32LE in just 4 lines of code.

Explanation

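We'll be working with null-terminated strings, so the first rule is that every string must end with a quadruple null (four consecutive zero bytes), regardless of encoding; four bytes are enough to terminate even UTF-32, whose code units are 32 bits wide. You may wish to add a definition such as the following: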

Copy Code

WideCharToMultiByte Simple Character Encoding Detection

Introduction

Explanation

相关推荐