StringProcess.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645
  1. #include "StdAfx.h"
  2. #include "StringProcess.h"
  3. namespace StringProcess
  4. {
  5. /************************************************************************/
  6. /*
  7. 函数: utf82unicode
  8. 描述: utf8字符串转为unicode字符串;
  9. 参数:
  10. pszutf8 utf8字符串;
  11. 返回: unicode字符串;
  12. 注意: 返回的指针所指向的内存需要释放;
  13. */
  14. /************************************************************************/
  15. WCHAR* utf82unicode(IN const char *pszutf8)
  16. {
  17. int wnSize = MultiByteToWideChar(CP_UTF8, 0, pszutf8, -1, NULL, 0);
  18. if (wnSize == ERROR_NO_UNICODE_TRANSLATION)
  19. {
  20. //throw std::exception("Invalide UTF-8 sequence");
  21. return NULL;
  22. }
  23. if (wnSize == 0)
  24. {
  25. //throw std::exception("Error in conversion");
  26. return NULL;
  27. }
  28. WCHAR *pwResult = new WCHAR[wnSize];
  29. int nConvertSize = MultiByteToWideChar(CP_UTF8, 0, pszutf8, -1, pwResult, wnSize);
  30. if (nConvertSize != wnSize)
  31. {
  32. //throw std::exception("la falla");
  33. if (pwResult)
  34. delete []pwResult;
  35. return NULL;
  36. }
  37. return pwResult;
  38. }
  39. /************************************************************************/
  40. /*
  41. 函数: utf82unicode
  42. 描述: utf8字符串转为unicode字符串;
  43. 参数:
  44. pszutf8 utf8字符串;
  45. pszunicode 返回的unicode字符串;
  46. 返回: unicode字符串大小;
  47. */
  48. /************************************************************************/
  49. int utf82unicode(IN const char *pszutf8, IN OUT WCHAR *pszunicode)
  50. {
  51. int wnSize = MultiByteToWideChar(CP_UTF8, 0, pszutf8, -1, NULL, 0);
  52. if (wnSize == ERROR_NO_UNICODE_TRANSLATION)
  53. {
  54. //throw std::exception("Invalide UTF-8 sequence");
  55. return 0;
  56. }
  57. if (wnSize == 0)
  58. {
  59. //throw std::exception("Error in conversion");
  60. return 0;
  61. }
  62. int nConvertSize = MultiByteToWideChar(CP_UTF8, 0, pszutf8, -1, pszunicode, wnSize);
  63. if (nConvertSize != wnSize)
  64. {
  65. //throw std::exception("la falla");
  66. return 0;
  67. }
  68. return wnSize;
  69. }
  70. /************************************************************************/
  71. /*
  72. 函数: unicode2acsii
  73. 描述: unicode字符串转为acsii字符串;
  74. 参数:
  75. pszunicode unicode字符串;
  76. 返回: acsii字符串;
  77. 注意: 返回的指针所指向的内存需要释放;
  78. */
  79. /************************************************************************/
  80. CHAR* unicode2acsii(IN const WCHAR *pszunicode)
  81. {
  82. int asciisize = WideCharToMultiByte(CP_OEMCP, 0, pszunicode, -1, NULL, 0, NULL, NULL);
  83. if (asciisize == ERROR_NO_UNICODE_TRANSLATION)
  84. {
  85. //throw std::exception("Invalid UTF-8 sequence.");
  86. return NULL;
  87. }
  88. if (asciisize == 0)
  89. {
  90. //throw std::exception("Error in conversion.");
  91. return NULL;
  92. }
  93. CHAR *pAscii = new CHAR[asciisize];
  94. int convresult = WideCharToMultiByte(CP_OEMCP, 0, pszunicode, -1, pAscii, asciisize, NULL, NULL);
  95. if (convresult != asciisize)
  96. {
  97. //throw std::exception("La falla!");
  98. if (pAscii) delete []pAscii;
  99. return NULL;
  100. }
  101. return pAscii;
  102. }
  103. /************************************************************************/
  104. /*
  105. 函数: unicode2acsii
  106. 描述: unicode字符串转为acsii字符串;
  107. 参数:
  108. pszunicode unicode字符串;
  109. pszacsii 返回的acsii字符串;
  110. 返回: acsii字符串大小;
  111. */
  112. /************************************************************************/
  113. int unicode2acsii(IN const WCHAR *pszunicode, IN OUT CHAR *pszacsii)
  114. {
  115. int asciisize = WideCharToMultiByte(CP_OEMCP, 0, pszunicode, -1, NULL, 0, NULL, NULL);
  116. if (asciisize == ERROR_NO_UNICODE_TRANSLATION)
  117. {
  118. //throw std::exception("Invalid UTF-8 sequence.");
  119. return 0;
  120. }
  121. if (asciisize == 0)
  122. {
  123. //throw std::exception("Error in conversion.");
  124. return 0;
  125. }
  126. int convresult = WideCharToMultiByte(CP_OEMCP, 0, pszunicode, -1, pszacsii, asciisize, NULL, NULL);
  127. if (convresult != asciisize)
  128. {
  129. //throw std::exception("La falla!");
  130. return 0;
  131. }
  132. return asciisize;
  133. }
  134. /************************************************************************/
  135. /*
  136. 函数: utf82ascii
  137. 描述: 将utf8字符串转为ascii字符串;
  138. 参数:
  139. pszutf8 utf8字符串;
  140. 返回: ascii字符串;
  141. 注意: 返回的指针需要手动释放所指内存;
  142. */
  143. /************************************************************************/
  144. CHAR* utf82ascii(const CHAR *pszutf8)
  145. {
  146. // 先把 utf8 转为 unicode ;
  147. WCHAR *pwstr = utf82unicode(pszutf8);
  148. // 最后把 unicode 转为 ascii ;
  149. CHAR *pacsii = NULL;
  150. if (pwstr)
  151. pacsii = unicode2acsii(pwstr);
  152. if (pwstr)
  153. delete []pwstr;
  154. return pacsii;
  155. }
  156. /************************************************************************/
  157. /*
  158. 函数: utf82ascii
  159. 描述: 将utf8字符串转为ascii字符串;
  160. 参数:
  161. pszutf8 utf8字符串;
  162. 返回: ascii字符串;
  163. 注意: 返回的指针需要手动释放所指内存;
  164. */
  165. /************************************************************************/
  166. int utf82ascii(IN const CHAR *pszutf8, IN OUT CHAR* pszacsii)
  167. {
  168. // 先把 utf8 转为 unicode ;
  169. WCHAR *pwstr = utf82unicode(pszutf8);
  170. // 最后把 unicode 转为 ascii ;
  171. int nascii = 0;
  172. if (pwstr)
  173. nascii = unicode2acsii(pwstr, pszacsii);
  174. if (pwstr)
  175. delete []pwstr;
  176. return nascii;
  177. }
  178. /************************************************************************/
  179. /*
  180. 函数: unicode2uft8
  181. 描述: 将unicode字符串转为utf8字符串;
  182. 参数:
  183. pszunicode unicode字符串;
  184. 返回: utf8字符串;
  185. 注意: 返回的指针需要手动释放所指内存;
  186. */
  187. /************************************************************************/
  188. CHAR* unicode2uft8(IN const WCHAR *pszunicode)
  189. {
  190. int utf8size = WideCharToMultiByte(CP_UTF8, 0, pszunicode, -1, NULL, 0, NULL, NULL);
  191. if (utf8size == 0)
  192. {
  193. //throw std::exception("Error in conversion.");
  194. return NULL;
  195. }
  196. CHAR* putf8 = new CHAR[utf8size];
  197. int convresult = WideCharToMultiByte(CP_UTF8, 0, pszunicode, -1, putf8, utf8size, NULL, NULL);
  198. if (convresult != utf8size)
  199. {
  200. //throw std::exception("La falla!");
  201. if (putf8)delete []putf8;
  202. return NULL;
  203. }
  204. return putf8;
  205. }
  206. /************************************************************************/
  207. /*
  208. 函数: unicode2uft8
  209. 描述: 将unicode字符串转为utf8字符串;
  210. 参数:
  211. pszunicode unicode字符串;
  212. pszutf8 返回的utf8字符串;
  213. 返回: utf8字符串大小;
  214. */
  215. /************************************************************************/
  216. int unicode2uft8(IN const WCHAR *pszunicode, IN OUT CHAR* pszutf8)
  217. {
  218. int utf8size = WideCharToMultiByte(CP_UTF8, 0, pszunicode, -1, NULL, 0, NULL, NULL);
  219. if (utf8size == 0)
  220. {
  221. //throw std::exception("Error in conversion.");
  222. return 0;
  223. }
  224. int convresult = WideCharToMultiByte(CP_UTF8, 0, pszunicode, -1, pszutf8, utf8size, NULL, NULL);
  225. if (convresult != utf8size)
  226. {
  227. //throw std::exception("La falla!");
  228. return 0;
  229. }
  230. return utf8size;
  231. }
  232. /************************************************************************/
  233. /*
  234. 函数: ascii2unicode
  235. 描述: 将ascii字符串转为unicode字符串;
  236. 参数:
  237. pszascii ascii字符串;
  238. 返回: unicode字符串;
  239. 注意: 返回的指针需要手动释放其所指的内存;
  240. */
  241. /************************************************************************/
  242. WCHAR* ascii2unicode(IN const CHAR* pszascii)
  243. {
  244. int wSize = MultiByteToWideChar(CP_ACP, 0, pszascii, -1, NULL, 0);
  245. if (wSize == ERROR_NO_UNICODE_TRANSLATION)
  246. {
  247. //throw std::exception("Invalid UTF-8 sequence.");
  248. return NULL;
  249. }
  250. if (wSize == 0)
  251. {
  252. //throw std::exception("Error in conversion.");
  253. return NULL;
  254. }
  255. WCHAR *punicode = new WCHAR[wSize];
  256. int convresult = MultiByteToWideChar(CP_ACP, 0, pszascii, -1, punicode, wSize);
  257. if (convresult != wSize)
  258. {
  259. //throw std::exception("La falla!");
  260. if (punicode) delete []punicode;
  261. return NULL;
  262. }
  263. return punicode;
  264. }
  265. /************************************************************************/
  266. /*
  267. 函数: ascii2unicode
  268. 描述: 将ascii字符串转为unicode字符串;
  269. 参数:
  270. pszascii ascii字符串;
  271. 返回: unicode字符串;
  272. 注意: 返回的指针需要手动释放其所指的内存;
  273. */
  274. /************************************************************************/
  275. int ascii2unicode(IN const CHAR* pszascii, IN OUT WCHAR *pszunicode)
  276. {
  277. int wSize = MultiByteToWideChar(CP_ACP, 0, pszascii, -1, NULL, 0);
  278. if (wSize == ERROR_NO_UNICODE_TRANSLATION)
  279. {
  280. //throw std::exception("Invalid UTF-8 sequence.");
  281. return 0;
  282. }
  283. if (wSize == 0)
  284. {
  285. //throw std::exception("Error in conversion.");
  286. return 0;
  287. }
  288. int convresult = MultiByteToWideChar(CP_ACP, 0, pszascii, -1, pszunicode, wSize);
  289. if (convresult != wSize)
  290. {
  291. //throw std::exception("La falla!");
  292. return 0;
  293. }
  294. return wSize;
  295. }
  296. /************************************************************************/
  297. /*
  298. 函数: ascii2utf8
  299. 描述: 将ascii字符串转为utf8字符串;
  300. 参数:
  301. pszascii ascii字符串;
  302. 返回: uft8字符串;
  303. 注意: 返回的指针需要手动释放其所指的内存;
  304. */
  305. /************************************************************************/
  306. CHAR* ascii2utf8(IN const CHAR* pszascii)
  307. {
  308. // 先把 ascii 转为 unicode ;
  309. WCHAR *pwstr = ascii2unicode(pszascii);
  310. // 最后把 unicode 转为 utf8 ;
  311. CHAR* putf8 = NULL;
  312. if (pwstr)
  313. putf8 = unicode2uft8(pwstr);
  314. if (pwstr)
  315. delete []pwstr;
  316. return putf8;
  317. }
  318. /************************************************************************/
  319. /*
  320. 函数: ascii2utf8
  321. 描述: 将ascii字符串转为utf8字符串;
  322. 参数:
  323. pszascii ascii字符串;
  324. 返回: uft8字符串;
  325. 注意: 返回的指针需要手动释放其所指的内存;
  326. */
  327. /************************************************************************/
  328. int ascii2utf8(IN const CHAR* pszascii, IN OUT CHAR* pszutf8)
  329. {
  330. // 先把 ascii 转为 unicode ;
  331. WCHAR *pwstr = ascii2unicode(pszascii);
  332. // 最后把 unicode 转为 utf8 ;
  333. int nSize = 0;
  334. if (pwstr)
  335. nSize = unicode2uft8(pwstr, pszutf8);
  336. if (pwstr)
  337. delete []pwstr;
  338. return nSize;
  339. }
  340. //////////////////////////////////////////////////////////////////////////
  341. void Gb2312ToUnicode(WCHAR* pOut, char *gbBuffer)
  342. {
  343. ::MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, gbBuffer, 2, pOut, 1);
  344. return;
  345. }
  346. //-------------------------------------------------------------
  347. // 函数:UTF_8ToUnicode
  348. // 描述:把3个uft-8中文字符转为单个unicode字符;
  349. // 参数:
  350. // pOut[IN OUT]: 返回的unicode字符;
  351. // pText[IN]: uft-8字符;
  352. // 返回:null;
  353. //-------------------------------------------------------------
  354. void UTF_8ToUnicode(WCHAR* pOut, char *pText)
  355. {
  356. char* uchar = (char *)pOut;
  357. uchar[1] = ((pText[0] & 0x0F) << 4) + ((pText[1] >> 2) & 0x0F);
  358. uchar[0] = ((pText[1] & 0x03) << 6) + (pText[2] & 0x3F);
  359. return;
  360. }
  361. //-------------------------------------------------------------
  362. // 函数:UnicodeToUTF_8
  363. // 描述:把单个unicode字符转为3个uft-8中文字符;
  364. // 参数:
  365. // pOut[IN OUT]: 返回的unicode字符;
  366. // pText[IN]: uft-8字符;
  367. // 返回:null;
  368. //-------------------------------------------------------------
  369. void UnicodeToUTF_8(char* pOut, WCHAR* pText)
  370. {
  371. // 注意 WCHAR高低字的顺序,低字节在前,高字节在后
  372. char* pchar = (char *)pText;
  373. pOut[0] = (0xE0 | ((pchar[1] & 0xF0) >> 4));
  374. pOut[1] = (0x80 | ((pchar[1] & 0x0F) << 2)) + ((pchar[0] & 0xC0) >> 6);
  375. pOut[2] = (0x80 | (pchar[0] & 0x3F));
  376. return;
  377. }
  378. void UnicodeToGB2312(char* pOut, WCHAR uData)
  379. {
  380. WideCharToMultiByte(CP_ACP, NULL, &uData, 1, pOut, sizeof(WCHAR), NULL, NULL);
  381. return;
  382. }
  // Helper for URL decoding: maps one hexadecimal digit character to its
  // numeric value (0..15). Accepts '0'-'9', 'a'-'f' and 'A'-'F'; any other
  // character yields -1.
  char CharToInt(char ch)
  {
      char value = -1;
      if ('0' <= ch && ch <= '9')
          value = static_cast<char>(ch - '0');
      else if ('a' <= ch && ch <= 'f')
          value = static_cast<char>(ch - 'a' + 10);
      else if ('A' <= ch && ch <= 'F')
          value = static_cast<char>(ch - 'A' + 10);
      return value;
  }
  391. char StrToBin(char *str)
  392. {
  393. char tempWord[2];
  394. char chn;
  395. tempWord[0] = CharToInt(str[0]); //make the B to 11 -- 00001011
  396. tempWord[1] = CharToInt(str[1]); //make the 0 to 0 -- 00000000
  397. chn = (tempWord[0] << 4) | tempWord[1]; //to change the BO to 10110000
  398. return chn;
  399. }
  400. //UTF_8 转gb2312
  401. void UTF_8ToGB2312(string &pOut, char *pText, int pLen)
  402. {
  403. char buf[4];
  404. char* rst = new char[pLen + (pLen >> 2) + 2];
  405. memset(buf, 0, 4);
  406. memset(rst, 0, pLen + (pLen >> 2) + 2);
  407. int i = 0;
  408. int j = 0;
  409. while (i < pLen)
  410. {
  411. if (*(pText + i) >= 0)
  412. {
  413. rst[j++] = pText[i++];
  414. }
  415. else
  416. {
  417. WCHAR Wtemp;
  418. UTF_8ToUnicode(&Wtemp, pText + i);
  419. UnicodeToGB2312(buf, Wtemp);
  420. unsigned short int tmp = 0;
  421. tmp = rst[j] = buf[0];
  422. tmp = rst[j + 1] = buf[1];
  423. tmp = rst[j + 2] = buf[2];
  424. //newBuf[j] = Ctemp[0];
  425. //newBuf[j + 1] = Ctemp[1];
  426. i += 3;
  427. j += 2;
  428. }
  429. }
  430. if (i == 0)
  431. rst[j] = '/0';
  432. pOut = rst;
  433. delete[]rst;
  434. }
  435. //GB2312 转为 UTF-8
  436. void GB2312ToUTF_8(string& pOut, char *pText, int pLen)
  437. {
  438. char buf[4];
  439. memset(buf, 0, 4);
  440. pOut.clear();
  441. int i = 0;
  442. while (i < pLen)
  443. {
  444. //如果是英文直接复制就可以
  445. if (pText[i] >= 0)
  446. {
  447. char asciistr[2] = { 0 };
  448. asciistr[0] = (pText[i++]);
  449. pOut.append(asciistr);
  450. }
  451. else
  452. {
  453. WCHAR pbuffer;
  454. Gb2312ToUnicode(&pbuffer, pText + i);
  455. UnicodeToUTF_8(buf, &pbuffer);
  456. pOut.append(buf);
  457. i += 2;
  458. }
  459. }
  460. return;
  461. }
  //-------------------------------------------------------------
  // Function: UrlGB2312
  // Summary : URL-encodes the bytes of str as they are (for GB2312 text a
  //           Chinese character therefore becomes two escapes, e.g. %D6%D0).
  // Params  :
  //   str [IN] NUL-terminated input; only read. NULL yields an empty string.
  // Returns : The encoded text:
  //             - ASCII letters and digits are copied unchanged;
  //             - a space becomes '+';
  //             - every other byte becomes %XX with upper-case hex digits.
  // Note    : Only the space character maps to '+'. Other whitespace (tab,
  //           newline, ...) is percent-escaped so that decoding restores the
  //           original byte. Letters and digits are tested by ASCII range,
  //           so the result does not depend on the C locale.
  //-------------------------------------------------------------
  string UrlGB2312(char * str)
  {
      string encoded;
      if (str == NULL)
          return encoded;

      static const char kHexDigits[] = "0123456789ABCDEF";

      const size_t len = strlen(str);
      encoded.reserve(len * 3);   // worst case: every byte is escaped
      for (size_t i = 0; i < len; ++i)
      {
          const unsigned char byte = static_cast<unsigned char>(str[i]);
          const bool isAsciiAlnum = (byte >= '0' && byte <= '9')
                                 || (byte >= 'A' && byte <= 'Z')
                                 || (byte >= 'a' && byte <= 'z');
          if (isAsciiAlnum)
          {
              encoded += static_cast<char>(byte);
          }
          else if (byte == ' ')
          {
              encoded += '+';
          }
          else
          {
              encoded += '%';
              encoded += kHexDigits[byte >> 4];
              encoded += kHexDigits[byte & 0x0F];
          }
      }
      return encoded;
  }
  492. //把str编码为网页中的 UTF-8 url encode ,英文不变,汉字三字节 如%3D%AE%88
  493. string UrlUTF8(char * str)
  494. {
  495. string tt;
  496. string dd;
  497. GB2312ToUTF_8(tt, str, (int)strlen(str));
  498. size_t len = tt.length();
  499. for (size_t i = 0; i < len; i++)
  500. {
  501. if (isalnum((BYTE)tt.at(i)))
  502. {
  503. char tempbuff[2] = { 0 };
  504. //sprintf(tempbuff,"%c",(BYTE)tt.at(i));
  505. sprintf_s(tempbuff, 2, "%c", (BYTE)tt.at(i));
  506. //StringCchPrintfA(tempbuff, 2, "%c", (BYTE)tt.at(i));
  507. dd.append(tempbuff);
  508. }
  509. else if (isspace((BYTE)tt.at(i)))
  510. {
  511. dd.append("+");
  512. }
  513. else
  514. {
  515. char tempbuff[4];
  516. //sprintf(tempbuff,"%%%X%X",((BYTE)tt.at(i)) >>4,((BYTE)tt.at(i)) %16);
  517. sprintf_s(tempbuff, 4, "%%%X%X", ((BYTE)tt.at(i)) >> 4, ((BYTE)tt.at(i)) % 16);
  518. //StringCchPrintfA(tempbuff, 4, "%%%X%X", ((BYTE)tt.at(i)) >> 4, ((BYTE)tt.at(i)) % 16);
  519. dd.append(tempbuff);
  520. }
  521. }
  522. return dd;
  523. }
  524. //把url GB2312解码
  525. string UrlGB2312Decode(string str)
  526. {
  527. string output = "";
  528. char tmp[2];
  529. int i = 0, idx = 0, len = str.length();
  530. while (i < len){
  531. if (str[i] == '%')
  532. {
  533. tmp[0] = str[i + 1];
  534. tmp[1] = str[i + 2];
  535. output += StrToBin(tmp);
  536. i = i + 3;
  537. }
  538. else if (str[i] == '+')
  539. {
  540. output += ' ';
  541. i++;
  542. }
  543. else{
  544. output += str[i];
  545. i++;
  546. }
  547. }
  548. return output;
  549. }
  550. //把url utf8解码
  551. string UrlUTF8Decode(string str)
  552. {
  553. string output = "";
  554. string temp = UrlGB2312Decode(str);//
  555. UTF_8ToGB2312(output, (char *)temp.data(), strlen(temp.data()));
  556. return output;
  557. }
  558. }