StringProcess.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645
  1. #include "StdAfx.h"
  2. #include "StringProcess.h"
  3. #include <strsafe.h>
  4. namespace StringProcess
  5. {
  6. /************************************************************************/
  7. /*
  8. 函数: utf82unicode
  9. 描述: utf8字符串转为unicode字符串;
  10. 参数:
  11. pszutf8 utf8字符串;
  12. 返回: unicode字符串;
  13. 注意: 返回的指针所指向的内存需要释放;
  14. */
  15. /************************************************************************/
  16. WCHAR* utf8_to_unicode(IN const char *pszutf8)
  17. {
  18. int wnSize = MultiByteToWideChar(CP_UTF8, 0, pszutf8, -1, NULL, 0);
  19. if (wnSize == ERROR_NO_UNICODE_TRANSLATION)
  20. {
  21. //throw std::exception("Invalide UTF-8 sequence");
  22. return NULL;
  23. }
  24. if (wnSize == 0)
  25. {
  26. //throw std::exception("Error in conversion");
  27. return NULL;
  28. }
  29. WCHAR *pwResult = new WCHAR[wnSize];
  30. int nConvertSize = MultiByteToWideChar(CP_UTF8, 0, pszutf8, -1, pwResult, wnSize);
  31. if (nConvertSize != wnSize)
  32. {
  33. //throw std::exception("la falla");
  34. if (pwResult)
  35. delete []pwResult;
  36. return NULL;
  37. }
  38. return pwResult;
  39. }
  40. /************************************************************************/
  41. /*
  42. 函数: utf82unicode
  43. 描述: utf8字符串转为unicode字符串;
  44. 参数:
  45. pszutf8 utf8字符串;
  46. pszunicode 返回的unicode字符串;
  47. 返回: unicode字符串大小;
  48. */
  49. /************************************************************************/
  50. int utf8_to_unicode(IN const char *pszutf8, IN OUT WCHAR *pszunicode)
  51. {
  52. int wnSize = MultiByteToWideChar(CP_UTF8, 0, pszutf8, -1, NULL, 0);
  53. if (wnSize == ERROR_NO_UNICODE_TRANSLATION)
  54. {
  55. //throw std::exception("Invalide UTF-8 sequence");
  56. return 0;
  57. }
  58. if (wnSize == 0)
  59. {
  60. //throw std::exception("Error in conversion");
  61. return 0;
  62. }
  63. int nConvertSize = MultiByteToWideChar(CP_UTF8, 0, pszutf8, -1, pszunicode, wnSize);
  64. if (nConvertSize != wnSize)
  65. {
  66. //throw std::exception("la falla");
  67. return 0;
  68. }
  69. return wnSize;
  70. }
  71. /************************************************************************/
  72. /*
  73. 函数: utf82ascii
  74. 描述: 将utf8字符串转为ascii字符串;
  75. 参数:
  76. pszutf8 utf8字符串;
  77. 返回: ascii字符串;
  78. 注意: 返回的指针需要手动释放所指内存;
  79. */
  80. /************************************************************************/
  81. CHAR* utf8_to_ascii(const CHAR *pszutf8)
  82. {
  83. // 先把 utf8 转为 unicode ;
  84. WCHAR *pwstr = utf8_to_unicode(pszutf8);
  85. // 最后把 unicode 转为 ascii ;
  86. CHAR *pacsii = NULL;
  87. if (pwstr)
  88. pacsii = unicode_to_ascii(pwstr);
  89. if (pwstr)
  90. delete []pwstr;
  91. return pacsii;
  92. }
  93. /************************************************************************/
  94. /*
  95. 函数: utf82ascii
  96. 描述: 将utf8字符串转为ascii字符串;
  97. 参数:
  98. pszutf8 utf8字符串;
  99. 返回: ascii字符串;
  100. 注意: 返回的指针需要手动释放所指内存;
  101. */
  102. /************************************************************************/
  103. int utf8_to_ascii(IN const CHAR *pszutf8, IN OUT CHAR* pszacsii)
  104. {
  105. // 先把 utf8 转为 unicode ;
  106. WCHAR *pwstr = utf8_to_unicode(pszutf8);
  107. // 最后把 unicode 转为 ascii ;
  108. int nascii = 0;
  109. if (pwstr)
  110. nascii = unicode_to_ascii(pwstr, pszacsii);
  111. if (pwstr)
  112. delete []pwstr;
  113. return nascii;
  114. }
  115. /************************************************************************/
  116. /*
  117. 函数: unicode2acsii
  118. 描述: unicode字符串转为acsii字符串;
  119. 参数:
  120. pszunicode unicode字符串;
  121. 返回: acsii字符串;
  122. 注意: 返回的指针所指向的内存需要释放;
  123. */
  124. /************************************************************************/
  125. CHAR* unicode_to_ascii(IN const WCHAR *pszunicode)
  126. {
  127. int asciisize = WideCharToMultiByte(CP_OEMCP, 0, pszunicode, -1, NULL, 0, NULL, NULL);
  128. if (asciisize == ERROR_NO_UNICODE_TRANSLATION)
  129. {
  130. //throw std::exception("Invalid UTF-8 sequence.");
  131. return NULL;
  132. }
  133. if (asciisize == 0)
  134. {
  135. //throw std::exception("Error in conversion.");
  136. return NULL;
  137. }
  138. CHAR *pAscii = new CHAR[asciisize];
  139. int convresult = WideCharToMultiByte(CP_OEMCP, 0, pszunicode, -1, pAscii, asciisize, NULL, NULL);
  140. if (convresult != asciisize)
  141. {
  142. //throw std::exception("La falla!");
  143. if (pAscii) delete []pAscii;
  144. return NULL;
  145. }
  146. return pAscii;
  147. }
  148. /************************************************************************/
  149. /*
  150. 函数: unicode2acsii
  151. 描述: unicode字符串转为acsii字符串;
  152. 参数:
  153. pszunicode unicode字符串;
  154. pszacsii 返回的acsii字符串;
  155. 返回: acsii字符串大小;
  156. */
  157. /************************************************************************/
  158. int unicode_to_ascii(IN const WCHAR *pszunicode, IN OUT CHAR *pszacsii)
  159. {
  160. int asciisize = WideCharToMultiByte(CP_OEMCP, 0, pszunicode, -1, NULL, 0, NULL, NULL);
  161. if (asciisize == ERROR_NO_UNICODE_TRANSLATION)
  162. {
  163. //throw std::exception("Invalid UTF-8 sequence.");
  164. return 0;
  165. }
  166. if (asciisize == 0)
  167. {
  168. //throw std::exception("Error in conversion.");
  169. return 0;
  170. }
  171. int convresult = WideCharToMultiByte(CP_OEMCP, 0, pszunicode, -1, pszacsii, asciisize, NULL, NULL);
  172. if (convresult != asciisize)
  173. {
  174. //throw std::exception("La falla!");
  175. return 0;
  176. }
  177. return asciisize;
  178. }
  179. /************************************************************************/
  180. /*
  181. 函数: unicode2uft8
  182. 描述: 将unicode字符串转为utf8字符串;
  183. 参数:
  184. pszunicode unicode字符串;
  185. 返回: utf8字符串;
  186. 注意: 返回的指针需要手动释放所指内存;
  187. */
  188. /************************************************************************/
  189. CHAR* unicode_to_uft8(IN const WCHAR *pszunicode)
  190. {
  191. int utf8size = WideCharToMultiByte(CP_UTF8, 0, pszunicode, -1, NULL, 0, NULL, NULL);
  192. if (utf8size == 0)
  193. {
  194. //throw std::exception("Error in conversion.");
  195. return NULL;
  196. }
  197. CHAR* putf8 = new CHAR[utf8size];
  198. int convresult = WideCharToMultiByte(CP_UTF8, 0, pszunicode, -1, putf8, utf8size, NULL, NULL);
  199. if (convresult != utf8size)
  200. {
  201. //throw std::exception("La falla!");
  202. if (putf8)delete []putf8;
  203. return NULL;
  204. }
  205. return putf8;
  206. }
  207. /************************************************************************/
  208. /*
  209. 函数: unicode2uft8
  210. 描述: 将unicode字符串转为utf8字符串;
  211. 参数:
  212. pszunicode unicode字符串;
  213. pszutf8 返回的utf8字符串;
  214. 返回: utf8字符串大小;
  215. */
  216. /************************************************************************/
  217. int unicode_to_uft8(IN const WCHAR *pszunicode, IN OUT CHAR* pszutf8)
  218. {
  219. int utf8size = WideCharToMultiByte(CP_UTF8, 0, pszunicode, -1, NULL, 0, NULL, NULL);
  220. if (utf8size == 0)
  221. {
  222. //throw std::exception("Error in conversion.");
  223. return 0;
  224. }
  225. int convresult = WideCharToMultiByte(CP_UTF8, 0, pszunicode, -1, pszutf8, utf8size, NULL, NULL);
  226. if (convresult != utf8size)
  227. {
  228. //throw std::exception("La falla!");
  229. return 0;
  230. }
  231. return utf8size;
  232. }
  233. /************************************************************************/
  234. /*
  235. 函数: ascii2unicode
  236. 描述: 将ascii字符串转为unicode字符串;
  237. 参数:
  238. pszascii ascii字符串;
  239. 返回: unicode字符串;
  240. 注意: 返回的指针需要手动释放其所指的内存;
  241. */
  242. /************************************************************************/
  243. WCHAR* ascii_to_unicode(IN const CHAR* pszascii)
  244. {
  245. int wSize = MultiByteToWideChar(CP_ACP, 0, pszascii, -1, NULL, 0);
  246. if (wSize == ERROR_NO_UNICODE_TRANSLATION)
  247. {
  248. //throw std::exception("Invalid UTF-8 sequence.");
  249. return NULL;
  250. }
  251. if (wSize == 0)
  252. {
  253. //throw std::exception("Error in conversion.");
  254. return NULL;
  255. }
  256. WCHAR *punicode = new WCHAR[wSize];
  257. int convresult = MultiByteToWideChar(CP_ACP, 0, pszascii, -1, punicode, wSize);
  258. if (convresult != wSize)
  259. {
  260. //throw std::exception("La falla!");
  261. if (punicode) delete []punicode;
  262. return NULL;
  263. }
  264. return punicode;
  265. }
  266. /************************************************************************/
  267. /*
  268. 函数: ascii2unicode
  269. 描述: 将ascii字符串转为unicode字符串;
  270. 参数:
  271. pszascii ascii字符串;
  272. 返回: unicode字符串;
  273. 注意: 返回的指针需要手动释放其所指的内存;
  274. */
  275. /************************************************************************/
  276. int ascii_to_unicode(IN const CHAR* pszascii, IN OUT WCHAR *pszunicode)
  277. {
  278. int wSize = MultiByteToWideChar(CP_ACP, 0, pszascii, -1, NULL, 0);
  279. if (wSize == ERROR_NO_UNICODE_TRANSLATION)
  280. {
  281. //throw std::exception("Invalid UTF-8 sequence.");
  282. return 0;
  283. }
  284. if (wSize == 0)
  285. {
  286. //throw std::exception("Error in conversion.");
  287. return 0;
  288. }
  289. int convresult = MultiByteToWideChar(CP_ACP, 0, pszascii, -1, pszunicode, wSize);
  290. if (convresult != wSize)
  291. {
  292. //throw std::exception("La falla!");
  293. return 0;
  294. }
  295. return wSize;
  296. }
  297. /************************************************************************/
  298. /*
  299. 函数: ascii2utf8
  300. 描述: 将ascii字符串转为utf8字符串;
  301. 参数:
  302. pszascii ascii字符串;
  303. 返回: uft8字符串;
  304. 注意: 返回的指针需要手动释放其所指的内存;
  305. */
  306. /************************************************************************/
  307. CHAR* ascii_to_utf8(IN const CHAR* pszascii)
  308. {
  309. // 先把 ascii 转为 unicode ;
  310. WCHAR *pwstr = ascii_to_unicode(pszascii);
  311. // 最后把 unicode 转为 utf8 ;
  312. CHAR* putf8 = NULL;
  313. if (pwstr)
  314. putf8 = unicode_to_uft8(pwstr);
  315. if (pwstr)
  316. delete []pwstr;
  317. return putf8;
  318. }
  319. /************************************************************************/
  320. /*
  321. 函数: ascii2utf8
  322. 描述: 将ascii字符串转为utf8字符串;
  323. 参数:
  324. pszascii ascii字符串;
  325. 返回: uft8字符串;
  326. 注意: 返回的指针需要手动释放其所指的内存;
  327. */
  328. /************************************************************************/
  329. int ascii_to_utf8(IN const CHAR* pszascii, IN OUT CHAR* pszutf8)
  330. {
  331. // 先把 ascii 转为 unicode ;
  332. WCHAR *pwstr = ascii_to_unicode(pszascii);
  333. // 最后把 unicode 转为 utf8 ;
  334. int nSize = 0;
  335. if (pwstr)
  336. nSize = unicode_to_uft8(pwstr, pszutf8);
  337. if (pwstr)
  338. delete []pwstr;
  339. return nSize;
  340. }
  341. //////////////////////////////////////////////////////////////////////////
  342. void Gb2312ToUnicode(WCHAR* pOut, char *gbBuffer)
  343. {
  344. ::MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, gbBuffer, 2, pOut, 1);
  345. return;
  346. }
  347. //-------------------------------------------------------------
  348. // 函数:UTF_8ToUnicode
  349. // 描述:把3个uft-8中文字符转为单个unicode字符;
  350. // 参数:
  351. // pOut[IN OUT]: 返回的unicode字符;
  352. // pText[IN]: uft-8字符;
  353. // 返回:null;
  354. //-------------------------------------------------------------
  355. void UTF_8ToUnicode(WCHAR* pOut, char *pText)
  356. {
  357. char* uchar = (char *)pOut;
  358. uchar[1] = ((pText[0] & 0x0F) << 4) + ((pText[1] >> 2) & 0x0F);
  359. uchar[0] = ((pText[1] & 0x03) << 6) + (pText[2] & 0x3F);
  360. return;
  361. }
  362. //-------------------------------------------------------------
  363. // 函数:UnicodeToUTF_8
  364. // 描述:把单个unicode字符转为3个uft-8中文字符;
  365. // 参数:
  366. // pOut[IN OUT]: 返回的unicode字符;
  367. // pText[IN]: uft-8字符;
  368. // 返回:null;
  369. //-------------------------------------------------------------
  370. void UnicodeToUTF_8(char* pOut, WCHAR* pText)
  371. {
  372. // 注意 WCHAR高低字的顺序,低字节在前,高字节在后
  373. char* pchar = (char *)pText;
  374. pOut[0] = (0xE0 | ((pchar[1] & 0xF0) >> 4));
  375. pOut[1] = (0x80 | ((pchar[1] & 0x0F) << 2)) + ((pchar[0] & 0xC0) >> 6);
  376. pOut[2] = (0x80 | (pchar[0] & 0x3F));
  377. return;
  378. }
  379. void UnicodeToGB2312(char* pOut, WCHAR uData)
  380. {
  381. WideCharToMultiByte(CP_ACP, NULL, &uData, 1, pOut, sizeof(WCHAR), NULL, NULL);
  382. return;
  383. }
  // Helper for URL decoding: numeric value of one hexadecimal digit.
  // Returns 0..15 for '0'-'9', 'a'-'f' and 'A'-'F', and -1 for any other
  // character.
  char CharToInt(char ch)
  {
    char digitValue = -1;
    if ('0' <= ch && ch <= '9')
      digitValue = (char)(ch - '0');
    else if ('a' <= ch && ch <= 'f')
      digitValue = (char)(ch - 'a' + 10);
    else if ('A' <= ch && ch <= 'F')
      digitValue = (char)(ch - 'A' + 10);
    return digitValue;
  }
  392. char StrToBin(char *str)
  393. {
  394. char tempWord[2];
  395. char chn;
  396. tempWord[0] = CharToInt(str[0]); //make the B to 11 -- 00001011
  397. tempWord[1] = CharToInt(str[1]); //make the 0 to 0 -- 00000000
  398. chn = (tempWord[0] << 4) | tempWord[1]; //to change the BO to 10110000
  399. return chn;
  400. }
  401. //UTF_8 转gb2312
  402. void UTF_8ToGB2312(string &pOut, char *pText, int pLen)
  403. {
  404. char buf[4];
  405. char* rst = new char[pLen + (pLen >> 2) + 2];
  406. memset(buf, 0, 4);
  407. memset(rst, 0, pLen + (pLen >> 2) + 2);
  408. int i = 0;
  409. int j = 0;
  410. while (i < pLen)
  411. {
  412. if (*(pText + i) >= 0)
  413. {
  414. rst[j++] = pText[i++];
  415. }
  416. else
  417. {
  418. WCHAR Wtemp;
  419. UTF_8ToUnicode(&Wtemp, pText + i);
  420. UnicodeToGB2312(buf, Wtemp);
  421. unsigned short int tmp = 0;
  422. tmp = rst[j] = buf[0];
  423. tmp = rst[j + 1] = buf[1];
  424. tmp = rst[j + 2] = buf[2];
  425. //newBuf[j] = Ctemp[0];
  426. //newBuf[j + 1] = Ctemp[1];
  427. i += 3;
  428. j += 2;
  429. }
  430. }
  431. if (i == 0)
  432. rst[j] = '/0';
  433. pOut = rst;
  434. delete[]rst;
  435. }
  436. //GB2312 转为 UTF-8
  437. void GB2312ToUTF_8(string& pOut, char *pText, int pLen)
  438. {
  439. char buf[4];
  440. memset(buf, 0, 4);
  441. pOut.clear();
  442. int i = 0;
  443. while (i < pLen)
  444. {
  445. //如果是英文直接复制就可以
  446. if (pText[i] >= 0)
  447. {
  448. char asciistr[2] = { 0 };
  449. asciistr[0] = (pText[i++]);
  450. pOut.append(asciistr);
  451. }
  452. else
  453. {
  454. WCHAR pbuffer;
  455. Gb2312ToUnicode(&pbuffer, pText + i);
  456. UnicodeToUTF_8(buf, &pbuffer);
  457. pOut.append(buf);
  458. i += 2;
  459. }
  460. }
  461. return;
  462. }
  // Function: UrlGB2312
  // Purpose : URL-encode a NUL-terminated string byte by byte, without any
  //           character-set conversion.
  // Params  :
  //   str [in] text to encode; NULL yields an empty string.
  // Returns : The encoded text: alphanumeric bytes are kept, whitespace
  //           bytes become '+', and every other byte becomes %XX with
  //           upper-case hexadecimal digits (a double-byte character
  //           therefore becomes two %XX groups).
  string UrlGB2312(char * str)
  {
    static const char kHexDigits[] = "0123456789ABCDEF";

    string encoded;
    if (str == NULL)
      return encoded;

    const size_t len = strlen(str);
    encoded.reserve(len);
    for (size_t i = 0; i < len; ++i)
    {
      // The character classification functions require an unsigned char value.
      const unsigned char ch = static_cast<unsigned char>(str[i]);
      if (isalnum(ch))
      {
        encoded.push_back(str[i]);
      }
      else if (isspace(ch))
      {
        encoded.push_back('+');
      }
      else
      {
        encoded.push_back('%');
        encoded.push_back(kHexDigits[ch >> 4]);
        encoded.push_back(kHexDigits[ch & 0x0F]);
      }
    }
    return encoded;
  }
  493. //把str编码为网页中的 UTF-8 url encode ,英文不变,汉字三字节 如%3D%AE%88
  494. string UrlUTF8(char * str)
  495. {
  496. string tt;
  497. string dd;
  498. GB2312ToUTF_8(tt, str, (int)strlen(str));
  499. size_t len = tt.length();
  500. for (size_t i = 0; i < len; i++)
  501. {
  502. if (isalnum((BYTE)tt.at(i)))
  503. {
  504. char tempbuff[2] = { 0 };
  505. //sprintf(tempbuff,"%c",(BYTE)tt.at(i));
  506. //sprintf_s(tempbuff, "%c", (BYTE)tt.at(i));
  507. StringCchPrintfA(tempbuff, 2, "%c", (BYTE)tt.at(i));
  508. dd.append(tempbuff);
  509. }
  510. else if (isspace((BYTE)tt.at(i)))
  511. {
  512. dd.append("+");
  513. }
  514. else
  515. {
  516. char tempbuff[4];
  517. //sprintf(tempbuff,"%%%X%X",((BYTE)tt.at(i)) >>4,((BYTE)tt.at(i)) %16);
  518. //sprintf_s(tempbuff, "%%%X%X", ((BYTE)tt.at(i)) >> 4, ((BYTE)tt.at(i)) % 16);
  519. StringCchPrintfA(tempbuff, 4, "%%%X%X", ((BYTE)tt.at(i)) >> 4, ((BYTE)tt.at(i)) % 16);
  520. dd.append(tempbuff);
  521. }
  522. }
  523. return dd;
  524. }
  // Value (0..15) of one hexadecimal digit, or -1 when ch is not a hex digit.
  // Kept local to the decoder and returned as int so that the "not a digit"
  // result stays negative regardless of whether plain char is signed.
  static int UrlHexDigitValue(char ch)
  {
    if (ch >= '0' && ch <= '9') return ch - '0';
    if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10;
    if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10;
    return -1;
  }

  // Function: UrlGB2312Decode
  // Purpose : Decode a URL-encoded string byte by byte, without any
  //           character-set conversion.
  // Params  :
  //   str [in] URL-encoded text.
  // Returns : The decoded bytes: "%XX" with two hexadecimal digits becomes
  //           the byte 0xXX, '+' becomes a space, everything else is copied.
  // Note    : A '%' that is not followed by two hexadecimal digits
  //           (including one too close to the end of the string) is copied
  //           literally, so the decoder never reads past the input.
  string UrlGB2312Decode(string str)
  {
    string output;
    const size_t len = str.length();
    output.reserve(len);

    size_t i = 0;
    while (i < len)
    {
      const char ch = str[i];
      if (ch == '%' && len - i >= 3)
      {
        const int high = UrlHexDigitValue(str[i + 1]);
        const int low = UrlHexDigitValue(str[i + 2]);
        if (high >= 0 && low >= 0)
        {
          output += static_cast<char>((high << 4) | low);
          i += 3;
          continue;
        }
      }
      output += (ch == '+') ? ' ' : ch;
      ++i;
    }
    return output;
  }
  551. //把url utf8解码
  552. string UrlUTF8Decode(string str)
  553. {
  554. string output = "";
  555. string temp = UrlGB2312Decode(str);//
  556. UTF_8ToGB2312(output, (char *)temp.data(), strlen(temp.data()));
  557. return output;
  558. }
  559. }