关键词搜索

源码搜索 ×
×

tars源码分析之10

发布2022-07-03浏览544次

详情内容

GBK 和 UTF-8 是你永远绕不开的话题,两者之间该怎么转换呢?

而且,我估计你还踩过坑。反正我踩过。

其实很简单,一起来看看代码:

  1. #include <iconv.h>
  2. #include <errno.h>
  3. #include <string.h>
  4. #include "util/tc_encoder.h"
  5. namespace tars
  6. {
  7. void TC_Encoder::gbk2utf8(char *sOut, int &iMaxOutLen, const char *sIn, int iInLen)
  8. {
  9. char * pIn = (char*)sIn;
  10. char * pEnd = pIn+iInLen;
  11. char * pOut = sOut;
  12. size_t iLeftLen;
  13. size_t iGbkLen;
  14. iconv_t cd;
  15. if (iInLen > iMaxOutLen)
  16. {
  17. throw TC_Encoder_Exception("[TC_Encoder::gbk2utf8] iInLen > iMaxOutLen error : ", errno);
  18. }
  19. cd = iconv_open("UTF-8","GBK");
  20. if (cd == (iconv_t)-1)
  21. {
  22. throw TC_Encoder_Exception("[TC_Encoder::gbk2utf8] iconv_open error : ", errno);
  23. }
  24. iLeftLen = iMaxOutLen;
  25. while(pIn < pEnd)
  26. {
  27. if((unsigned char)(*pIn)==0x80)
  28. {
  29. //注意GBK的0x80转换为UTF-8时为E2 82 AC
  30. *pOut = 0xe2; pOut++; iLeftLen--;
  31. *pOut = 0x82; pOut++; iLeftLen--;
  32. *pOut = 0xac; pOut++; iLeftLen--;
  33. pIn++;
  34. }
  35. else if((unsigned char)(*pIn)<0x80)
  36. {
  37. //单字节(GBK: 0x00-0x7F)
  38. *pOut = *pIn;
  39. pIn++;pOut++;iLeftLen--;
  40. }
  41. else
  42. {
  43. //双字节
  44. iGbkLen=2;
  45. int iRet=iconv(cd, &pIn, (size_t *)&iGbkLen, (char **)&pOut, (size_t *)&iLeftLen);
  46. if(iRet < 0)
  47. {
  48. *pOut = ' '; //转换不了替换为空格
  49. pIn+=2; pOut++; iLeftLen--;
  50. }
  51. }
  52. }
  53. iconv_close(cd);
  54. sOut[iMaxOutLen - iLeftLen] = '\0';
  55. iMaxOutLen = iMaxOutLen - iLeftLen;
  56. }
  57. string TC_Encoder::gbk2utf8(const string &sIn)
  58. {
  59. iconv_t cd;
  60. cd = iconv_open("UTF-8","GBK");
  61. if (cd == (iconv_t)-1)
  62. {
  63. throw TC_Encoder_Exception("[TC_Encoder::gbk2utf8] iconv_open error", errno);
  64. }
  65. string sOut;
  66. for(string::size_type pos = 0; pos < sIn.length(); ++pos)
  67. {
  68. if((unsigned char)sIn[pos] == 0x80)
  69. {
  70. //注意GBK的0x80转换为UTF-8时为E2 82 AC
  71. sOut += 0xe2;
  72. sOut += 0x82;
  73. sOut += 0xac;
  74. }
  75. else if((unsigned char)sIn[pos] < 0x80)
  76. {
  77. //单字节(GBK: 0x00-0x7F)
  78. sOut += sIn[pos];
  79. }
  80. else
  81. {
  82. //双字节
  83. size_t sizeGbkLen = 2;
  84. char pIn[128] = "\0";
  85. strncpy(pIn, sIn.c_str() + pos, sizeGbkLen);
  86. char *p = pIn;
  87. size_t sizeLeftLen = 128;
  88. char pOut[128] = "\0";
  89. char *o = pOut;
  90. int iRet = iconv(cd, &p, &sizeGbkLen, (char **)&o, &sizeLeftLen);
  91. if(iRet < 0)
  92. {
  93. //转换不了, 暂时替换为空格
  94. sOut += ' ';
  95. }
  96. else
  97. {
  98. sOut += pOut;
  99. }
  100. ++pos;
  101. }
  102. }
  103. iconv_close(cd);
  104. return sOut;
  105. }
  106. void TC_Encoder::gbk2utf8(const string &sIn, vector<string> &vtStr)
  107. {
  108. iconv_t cd;
  109. cd = iconv_open("UTF-8","GBK");
  110. if (cd == (iconv_t)-1)
  111. {
  112. throw TC_Encoder_Exception("[TC_Encoder::gbk2utf8] iconv_open error", errno);
  113. }
  114. vtStr.clear();
  115. for(string::size_type pos = 0; pos < sIn.length(); ++pos)
  116. {
  117. string sOut;
  118. if((unsigned char)sIn[pos] == 0x80)
  119. {
  120. //注意GBK的0x80转换为UTF-8时为E2 82 AC
  121. sOut += 0xe2;
  122. sOut += 0x82;
  123. sOut += 0xac;
  124. }
  125. else if((unsigned char)sIn[pos] < 0x80)
  126. {
  127. //单字节(GBK: 0x00-0x7F)
  128. sOut += sIn[pos];
  129. }
  130. else
  131. {
  132. //双字节
  133. size_t iGbkLen = 2;
  134. char pIn[128] = "\0";
  135. strncpy(pIn, sIn.c_str() + pos, iGbkLen);
  136. char *p = pIn;
  137. size_t iLeftLen = 128;
  138. char pOut[128] = "\0";
  139. char *o = pOut;
  140. int iRet = iconv(cd, &p, (size_t *)&iGbkLen, (char **)&o, (size_t *)&iLeftLen);
  141. if(iRet < 0)
  142. {
  143. //转换不了, 暂时替换为空格
  144. sOut += ' ';
  145. }
  146. else
  147. {
  148. sOut += pOut;
  149. }
  150. ++pos;
  151. }
  152. vtStr.push_back(sOut);
  153. }
  154. iconv_close(cd);
  155. }
  156. void TC_Encoder::utf82gbk(char *sOut, int &iMaxOutLen, const char *sIn, int iInLen)
  157. {
  158. iconv_t cd;
  159. cd = iconv_open("GBK","UTF-8");
  160. if (cd == (iconv_t)-1)
  161. {
  162. throw TC_Encoder_Exception("[TC_Encoder::utf82gbk] iconv_open error", errno);
  163. }
  164. char * pIn = (char*)sIn;
  165. size_t sizeLeftLen = iMaxOutLen;
  166. size_t sizeInLen = iInLen;
  167. char* pOut = sOut;
  168. size_t ret = iconv(cd, &pIn, &sizeInLen, (char **)&sOut, &sizeLeftLen);
  169. if (ret == (size_t) - 1)
  170. {
  171. iMaxOutLen = 0;
  172. iconv_close(cd);
  173. throw TC_Encoder_Exception("[TC_Encoder::utf82gbk] iconv error", errno);
  174. return;
  175. }
  176. iconv_close(cd);
  177. pOut[iMaxOutLen - (int)sizeLeftLen] = '\0';
  178. iMaxOutLen = iMaxOutLen - (int)sizeLeftLen;
  179. }
  180. string TC_Encoder::utf82gbk(const string &sIn)
  181. {
  182. if(sIn.length() == 0)
  183. {
  184. return "";
  185. }
  186. string sOut;
  187. int iLen = sIn.length() * 2 + 1;
  188. char *pOut = new char[iLen];
  189. try
  190. {
  191. utf82gbk(pOut, iLen, sIn.c_str(), sIn.length());
  192. }
  193. catch (TC_Encoder_Exception& e)
  194. {
  195. delete[] pOut;
  196. throw e;
  197. }
  198. sOut.assign(pOut, iLen);
  199. delete[] pOut;
  200. return sOut;
  201. }
  202. /**
  203. * \n -> \r\0
  204. * \r -> \r\r
  205. */
  206. string TC_Encoder::transTo(const string& str, char f /*='\n'*/, char t /*= '\r'*/, char u /*= '\0'*/)
  207. {
  208. string ret = str;
  209. for (size_t i = 0; i < ret.length(); ++i)
  210. {
  211. if (ret[i] == f)
  212. {
  213. ret[i] = t;
  214. ret.insert(++i, 1, u);
  215. }
  216. else if (ret[i] == t)
  217. {
  218. ret.insert(++i, 1, t);
  219. }
  220. }
  221. return ret;
  222. }
  223. /**
  224. * \r\0 -> \n
  225. * \r\r -> \r
  226. */
  227. string TC_Encoder::transFrom(const string& str, char f /*= '\n'*/, char t /*= '\r'*/, char u /*= '\0'*/)
  228. {
  229. string ret = "";
  230. for (string::const_iterator it = str.begin()
  231. ; it != str.end()
  232. ; ++it)
  233. {
  234. ret.append(1, *it);
  235. if (*it == t)
  236. {
  237. if (*(++it) == u)
  238. {
  239. *ret.rbegin() = f;
  240. }
  241. }
  242. }
  243. return ret;
  244. }
  245. }

相关技术文章

点击QQ咨询
开通会员
返回顶部
×
微信扫码支付
微信扫码支付
确定支付下载
请使用微信扫描二维码支付
×

提示信息

×

选择支付方式

  • 微信支付
  • 支付宝付款
确定支付下载