notepad--/Encode.cpp

400 lines
8.2 KiB
C++
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "Encode.h"
#include <QTextCodec>
#include <QtDebug>
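/* Helper class for detecting and converting text encodings. Conclusions drawn from the literature:
 * - Files stored as UTF-8 with BOM or as UTF-16 ("Unicode") carry an identifying mark in their first
 *   2-3 bytes. When that mark is present, the file is handled directly with the matching encoding.
 * - Without such a mark, the file is assumed to be either UTF-8 (no BOM) or ANSI (currently only GBK
 *   is considered as the ANSI candidate).
 * - In that case the content has to be analysed: it is decoded as UTF-8 first and, if that decoding
 *   reports errors, decoded again as GBK. If the GBK decoding succeeds, the file is most likely GBK.
 */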
// Constructor: performs no explicit initialisation.
Encode::Encode() = default;
// Destructor: performs no explicit clean-up.
Encode::~Encode() = default;
// Map a display name such as "UTF8-BOM" or "GBK" to its CODE_ID.
//
// The comparison is an exact match against the labels listed below. Any
// other string, including the literal "unknown", yields CODE_ID::UNKOWN.
CODE_ID Encode::getCodeByName(QString name)
{
    if (name == "UTF16-LE")  return CODE_ID::UNICODE_LE;
    if (name == "UTF16-BE")  return CODE_ID::UNICODE_BE;
    if (name == "UTF8")      return CODE_ID::UTF8_NOBOM;
    if (name == "UTF8-BOM")  return CODE_ID::UTF8_BOM;
    if (name == "GBK")       return CODE_ID::GBK;
    if (name == "EUC-JP")    return CODE_ID::EUC_JP;
    if (name == "Shift-JIS") return CODE_ID::Shift_JIS;
    if (name == "EUC-KR")    return CODE_ID::EUC_KR;
    if (name == "KOI8-R")    return CODE_ID::KOI8_R;
    if (name == "TSCII")     return CODE_ID::TSCII;
    if (name == "TIS-620")   return CODE_ID::TIS_620;

    // "unknown" and every unrecognised label share the same result.
    return CODE_ID::UNKOWN;
}
// Return the display label for a line-ending style.
//
// PAD_LINE and UNKNOWN_LINE yield "NULL"; UNIX_LINE, DOS_LINE and MAC_LINE
// yield their respective labels; any other value yields an empty QString.
QString Encode::getLineEndById(RC_LINE_FORM id)
{
    switch (id)
    {
    case PAD_LINE:
    case UNKNOWN_LINE:
        // NOTE(review): a platform-specific default ("Windows(CR LF)" under
        // WIN32, "Unix(LF)" otherwise) used to be stored here and was then
        // unconditionally overwritten by "NULL". Those dead stores are gone;
        // the value callers actually receive is unchanged. Confirm whether
        // the platform default was the intended result.
        return "NULL";
    case UNIX_LINE:
        return "Unix(LF)";
    case DOS_LINE:
        return "Windows(CR LF)";
    case MAC_LINE:
        return "Mac(CR)";
    default:
        return QString();
    }
}
// Return the display name for an encoding id.
//
// UNKOWN, ANSI and any id not listed below are all reported as "unknown".
QString Encode::getCodeNameById(CODE_ID id)
{
    switch (id)
    {
    case UNICODE_LE:
        return "UTF16-LE";
    case UNICODE_BE:
        return "UTF16-BE";
    case UTF8_NOBOM:
        return "UTF8";
    case UTF8_BOM:
        return "UTF8-BOM";
    case GBK:
        return "GBK";
    case EUC_JP:
        return "EUC-JP";
    case Shift_JIS:
        return "Shift-JIS";
    case EUC_KR:
        return "EUC-KR";
    case KOI8_R:
        return "KOI8-R";
    case TSCII:
        return "TSCII";
    case TIS_620:
        return "TIS-620";
    case UNKOWN:
    case ANSI:
    default:
        return "unknown";
    }
}
// Return the byte-order mark that prefixes a file stored in the given encoding.
//
// UNICODE_LE -> FF FE, UNICODE_BE -> FE FF, UTF8_BOM -> EF BB BF.
// Every other encoding has no mark and yields a default-constructed QByteArray.
QByteArray Encode::getEncodeStartFlagByte(CODE_ID code)
{
    switch (code)
    {
    case UNICODE_LE:
        return QByteArray("\xFF\xFE", 2);
    case UNICODE_BE:
        return QByteArray("\xFE\xFF", 2);
    case UTF8_BOM:
        return QByteArray("\xEF\xBB\xBF", 3);
    default:
        return QByteArray();
    }
}
// Detect the encoding of a raw byte buffer.
//
// A leading byte-order mark decides the result when present:
//   FF FE    -> UNICODE_LE, skip = 2
//   FE FF    -> UNICODE_BE, skip = 2
//   EF BB BF -> UTF8_BOM,   skip = 3
// Otherwise the content is classified by CheckUnicodeWithoutBOM and skip = 0.
//
// pBuffer : bytes to inspect (may be null).
// length  : number of readable bytes in pBuffer.
// skip    : out; number of leading BOM bytes the caller should skip.
CODE_ID Encode::DetectEncode(const uchar* pBuffer, int length, int &skip)
{
    skip = 0;

    // Nothing to inspect. An empty buffer contains no invalid UTF-8, which is
    // the condition under which the BOM-less check reports UTF8_NOBOM, so the
    // same answer is given here without touching the buffer.
    if (pBuffer == nullptr || length <= 0)
    {
        return CODE_ID::UTF8_NOBOM;
    }

    // Each BOM probe is guarded by the buffer length so that one- and
    // two-byte inputs are never read past their end.
    if (length >= 2)
    {
        if (pBuffer[0] == 0xFF && pBuffer[1] == 0xFE)
        {
            skip = 2;
            return CODE_ID::UNICODE_LE;
        }
        if (pBuffer[0] == 0xFE && pBuffer[1] == 0xFF)
        {
            skip = 2;
            return CODE_ID::UNICODE_BE;
        }
    }
    if (length >= 3 && pBuffer[0] == 0xEF && pBuffer[1] == 0xBB && pBuffer[2] == 0xBF)
    {
        skip = 3;
        return CODE_ID::UTF8_BOM;
    }

    // No BOM: the encoding cannot be read from a marker, so analyse the content.
    return CheckUnicodeWithoutBOM(pBuffer, length);
}
// Decode GBK bytes into a QString.
//
// pText  : bytes to decode.
// length : number of bytes in pText.
// out    : receives the decoded text whenever decoding was attempted, even if
//          invalid characters were encountered.
// Returns true when decoding reported no invalid characters. Returns false,
// leaving out untouched, when the arguments are unusable or the GBK codec is
// not available.
bool Encode::tranGbkToUNICODE(const char* pText, int length, QString &out)
{
    if (length < 0 || (pText == nullptr && length > 0))
    {
        return false;
    }

    // codecForName() yields nullptr when the codec is unavailable.
    QTextCodec *codec = QTextCodec::codecForName("GBK");
    if (codec == nullptr)
    {
        return false;
    }

    QTextCodec::ConverterState state;
    out = codec->toUnicode(pText, length, &state);
    return state.invalidChars <= 0;
}
// Decode UTF-8 bytes into a QString.
//
// pText  : bytes to decode.
// length : number of bytes in pText.
// out    : receives the decoded text whenever decoding was attempted, even if
//          invalid characters were encountered.
// Returns true when decoding reported no invalid characters. Returns false,
// leaving out untouched, when the arguments are unusable or the UTF-8 codec
// is not available.
bool Encode::tranUtf8ToUNICODE(const char* pText, int length, QString &out)
{
    if (length < 0 || (pText == nullptr && length > 0))
    {
        return false;
    }

    // codecForName() yields nullptr when the codec is unavailable.
    QTextCodec *codec = QTextCodec::codecForName("UTF-8");
    if (codec == nullptr)
    {
        return false;
    }

    QTextCodec::ConverterState state;
    out = codec->toUnicode(pText, length, &state);
    return state.invalidChars <= 0;
}
// Like getCodeNameById, but returns the codec name string that is handed to Qt.
//
// UTF8_NOBOM and UTF8_BOM both map to "UTF8" (Qt has no separate no-BOM
// variant). UNKOWN, ANSI and any unlisted id map to "unknown".
QString Encode::getQtCodecNameById(CODE_ID id)
{
    switch (id)
    {
    case UNICODE_LE:
        return "UTF16-LE";
    case UNICODE_BE:
        return "UTF16-BE";
    case UTF8_NOBOM:
    case UTF8_BOM:
        return "UTF8";
    case GBK:
        return "GBK";
    case EUC_JP:
        return "EUC-JP";
    case Shift_JIS:
        return "Shift-JIS";
    case EUC_KR:
        return "EUC-KR";
    case KOI8_R:
        return "KOI8-R";
    case TSCII:
        return "TSCII";
    case TIS_620:
        return "TIS-620";
    case UNKOWN:
    case ANSI:
    default:
        return "unknown";
    }
}
// Convert a byte string in the given encoding to a QString.
//
// code   : encoding the bytes are assumed to be in.
// pText  : bytes to decode.
// length : number of bytes in pText; a negative value is rejected.
// out    : receives the decoded text whenever decoding was attempted.
// Returns true when decoding reported no invalid characters; false when the
// length is negative, no codec could be obtained, or invalid characters were
// found.
bool Encode::tranStrToUNICODE(CODE_ID code, const char* pText, int length, QString &out)
{
    if (length < 0)
    {
        return false;
    }

    const QString qtName = getQtCodecNameById(code);
    const bool recognised = !qtName.isEmpty() && qtName != "unknown";

    // Unrecognised encodings are decoded as UTF-8 so the user is not asked to
    // pick one. This is a known simplification rather than a correct answer.
    QTextCodec *decoder = recognised
        ? QTextCodec::codecForName(qtName.toStdString().c_str())
        : QTextCodec::codecForName("UTF-8");
    if (decoder == nullptr)
    {
        return false;
    }

    QTextCodec::ConverterState status;
    out = decoder->toUnicode(pText, length, &status);
    return status.invalidChars <= 0;
}
/* Classify BOM-less text by trial decoding. Only UTF-8 and GBK are tried;
 * other languages' code pages are not enumerated.
 * For the differences between GB2312, GBK and GB18030 see
 * https://cloud.tencent.com/developer/article/1343240
 * For a detailed description of the encodings see
 * https://blog.csdn.net/libaineu2004/article/details/19245205
 *
 * UTF-8 is checked first: text that is really GBK makes the UTF-8 decoder
 * report invalid characters, so a clean UTF-8 decode is taken as UTF-8.
 *
 * pText  : bytes to classify.
 * length : number of bytes in pText.
 * Returns UTF8_NOBOM when the bytes decode cleanly as UTF-8, GBK when they do
 * not but decode cleanly as GBK, ANSI when neither decode is clean (or GBK
 * cannot be tried), and UNKOWN when the UTF-8 codec itself is unavailable.
 */
CODE_ID Encode::CheckUnicodeWithoutBOM(const uchar* pText, int length)
{
    const char *bytes = reinterpret_cast<const char *>(pText);

    // codecForName() yields nullptr when the codec is unavailable.
    QTextCodec *utf8Codec = QTextCodec::codecForName("UTF-8");
    if (utf8Codec == nullptr)
    {
        return CODE_ID::UNKOWN;
    }

    // Only the converter state matters here; the decoded text is discarded.
    QTextCodec::ConverterState utf8State;
    utf8Codec->toUnicode(bytes, length, &utf8State);
    if (utf8State.invalidChars <= 0)
    {
        return CODE_ID::UTF8_NOBOM;
    }

    /* Not UTF-8. Only GBK is tried next because the Chinese edition comes
     * first; an international edition should check the local ANSI code page
     * instead of assuming GBK. */
    QTextCodec *gbkCodec = QTextCodec::codecForName("GBK");
    if (gbkCodec == nullptr)
    {
        // GBK cannot be verified; all that is known is that it is not UTF-8.
        return CODE_ID::ANSI;
    }

    QTextCodec::ConverterState gbkState;
    gbkCodec->toUnicode(bytes, length, &gbkState);
    return (gbkState.invalidChars > 0) ? CODE_ID::ANSI : CODE_ID::GBK;
}
// Classify BOM-less text by trial decoding and hand back the decoded text.
//
// pText          : bytes to classify.
// length         : number of bytes in pText.
// outUnicodeText : receives the UTF-8 decoding, replaced by the GBK decoding
//                  when the result is GBK. Left untouched when the UTF-8
//                  codec is unavailable.
// Returns UTF8_NOBOM when the bytes decode cleanly as UTF-8, GBK when they do
// not but decode cleanly as GBK, ANSI when neither decode is clean (or GBK
// cannot be tried), and UNKOWN when the UTF-8 codec itself is unavailable.
CODE_ID Encode::CheckUnicodeWithoutBOM(const uchar* pText, int length, QString &outUnicodeText)
{
    const char *bytes = reinterpret_cast<const char *>(pText);

    // codecForName() yields nullptr when the codec is unavailable.
    QTextCodec *utf8Codec = QTextCodec::codecForName("UTF-8");
    if (utf8Codec == nullptr)
    {
        return CODE_ID::UNKOWN;
    }

    QTextCodec::ConverterState utf8State;
    outUnicodeText = utf8Codec->toUnicode(bytes, length, &utf8State);
    if (utf8State.invalidChars <= 0)
    {
        return CODE_ID::UTF8_NOBOM;
    }

    /* Not UTF-8. Only GBK is tried next because the Chinese edition comes
     * first; an international edition should check the local ANSI code page
     * instead of assuming GBK. */
    QTextCodec *gbkCodec = QTextCodec::codecForName("GBK");
    if (gbkCodec == nullptr)
    {
        // GBK cannot be verified; keep the UTF-8 decoding and report ANSI.
        return CODE_ID::ANSI;
    }

    QTextCodec::ConverterState gbkState;
    const QString gbkText = gbkCodec->toUnicode(bytes, length, &gbkState);
    if (gbkState.invalidChars > 0)
    {
        // Not GBK either: outUnicodeText keeps the UTF-8 decoding made above
        // and the encoding is reported as ANSI.
        return CODE_ID::ANSI;
    }

    outUnicodeText = gbkText;
    return CODE_ID::GBK;
}
// Check whether every byte of the text is a 7-bit ASCII code (0x00-0x7F).
//
// pText  : bytes to inspect.
// length : number of bytes in pText; the loop does not run when it is <= 0.
// Returns false at the first byte above 0x7F, otherwise true (including for
// an empty range).
bool Encode::CheckTextIsAllAscii(const uchar* pText, int length)
{
    for (int i = 0; i < length; ++i)
    {
        // uchar is Qt's unsigned char typedef, so a byte can never be
        // negative; only the upper bound needs testing.
        if (pText[i] > 0x7F)
        {
            return false;
        }
    }
    return true;
}