Dict.h 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
  1. #pragma once
  2. #ifndef __DICT_H_
  3. #define __DICT_H_
  4. #include <vector>
  5. #include <string>
  6. #include <algorithm>
  7. #include <fstream>
  8. #include <sstream>
  9. #include "bitfunc.h"
  10. #include "Image.hpp"
  11. #include "helpfunc.h"
  12. #ifdef UNICODE
  13. using TString = std::wstring;
  14. #else
  15. using TString = std::string;
  16. #endif
  17. //#define SET_BIT(x, idx) x |= 1u << (idx)
  18. //#define GET_BIT(x, idx) (((x )>> (idx)) & 1u)
  19. const int op_dict_version = 2;
  20. #ifndef max
  21. #define max(a,b) (((a) > (b)) ? (a) : (b))
  22. #endif
  23. #ifndef min
  24. #define min(a,b) (((a) < (b)) ? (a) : (b))
  25. #endif
  26. /*
  27. 第 0 代字库
  28. */
  29. struct word_info_t
  30. {
  31. //char of word
  32. TCHAR _char[4];
  33. //char height
  34. __int16 width, height;
  35. //char bit ct
  36. __int32 bit_count;
  37. word_info_t() :width(0), height(0), bit_count(0)
  38. {
  39. memset(_char, 0, sizeof(_char));
  40. }
  41. bool operator==(const word_info_t& rhs) {
  42. return width == rhs.width && height == rhs.height;
  43. }
  44. bool operator!=(const word_info_t& rhs) {
  45. return width != rhs.width && height == rhs.height;
  46. }
  47. };
  48. struct word_t
  49. {
  50. //32 bit a col
  51. using cline_t = unsigned __int32;
  52. word_info_t info;
  53. //char col line
  54. cline_t clines[32];
  55. bool operator==(const word_t& rhs)
  56. {
  57. if (info != rhs.info)
  58. return false;
  59. for (int i = 0; i < info.width; ++i)
  60. if (clines[i] != rhs.clines[i])
  61. return false;
  62. return true;
  63. }
  64. void set_chars(const TString& s)
  65. {
  66. memcpy(info._char, s.c_str(), min(sizeof(info._char), (s.length() + 1) * sizeof(TCHAR)));
  67. }
  68. void fromDm(const TCHAR* str, int ct, const TString& w)
  69. {
  70. int bin[50] = { 0 };
  71. const int DM_DICT_HEIGTH = 11;
  72. ct = min(ct, 88);
  73. int i = 0;
  74. auto hex2bin = [](TCHAR c)
  75. {
  76. return c <= _T('9') ? c - _T('0') : c - _T('A') + 10;
  77. };
  78. while (i < ct)
  79. {
  80. bin[i / 2] = (hex2bin(str[i]) << 4) | (hex2bin(str[i + 1]));
  81. i += 2;
  82. }
  83. //
  84. int cols = (ct * 4) / DM_DICT_HEIGTH;
  85. memset(this, 0x0, sizeof(*this));
  86. for (int j = 0; j < cols; ++j) {
  87. for (int i = 0; i < 11; ++i) {
  88. int idx = j * 11 + i;
  89. if (GET_BIT(bin[idx >> 3], 7 - (idx & 7))) {
  90. SET_BIT(clines[j], 31 - i);
  91. ++info.bit_count;
  92. }
  93. }
  94. }
  95. info.height = DM_DICT_HEIGTH;
  96. info.width = cols;
  97. set_chars(w);
  98. }
  99. };
  100. /*
  101. 第 1 代字库
  102. */
  103. struct word1_info {
  104. uint8_t w, h;//max is 255 2B
  105. uint16_t bit_cnt;//max is 255*255=65025<65536 4B
  106. TCHAR name[8];//name 12B
  107. word1_info() :w(0), h(0), bit_cnt(0) {}
  108. };
  109. struct word1_t
  110. {
  111. word1_info info;
  112. vector<uint8_t> data;//size is (w*h+7)/8
  113. bool operator==(const word1_t& rhs) {
  114. if (info.w != rhs.info.w || info.h != rhs.info.h || info.bit_cnt != rhs.info.bit_cnt)
  115. return false;
  116. for (size_t i = 0;i < data.size();i++)
  117. if (data[i] != rhs.data[i])
  118. return false;
  119. return true;
  120. }
  121. void set_chars(const TString& s) {
  122. int nlen = s.length() < 8 ? s.length() : 7;
  123. memcpy(info.name, s.c_str(), nlen * 2);
  124. info.name[nlen] = L'\0';
  125. }
  126. void from_word(word_t& wd) {
  127. info.w = (uint8_t)wd.info.width;
  128. info.h = wd.info.height;
  129. init();
  130. info.bit_cnt = wd.info.bit_count;
  131. memcpy(info.name, wd.info._char, 4 * sizeof(wchar_t));
  132. info.name[3] = 0;
  133. int idx = 0;
  134. for (int x = 0; x < info.w; x++) {
  135. for (int y = 0; y < info.h; y++) {
  136. if (GET_BIT(wd.clines[x], 31 - y))
  137. SET_BIT(data[idx / 8], idx & 7);
  138. idx++;
  139. }
  140. }
  141. }
  142. void init()
  143. {
  144. data.resize((info.w * info.h + 7) / 8);
  145. std::fill(data.begin(), data.end(), 0);
  146. }
  147. };
  148. struct Dict
  149. {
  150. //v0 v1
  151. struct dict_info_t {
  152. __int16 _this_ver;//0 1
  153. __int16 _word_count;
  154. //check code=_this_ver^_word_count
  155. __int32 _check_code;
  156. dict_info_t() :_this_ver(1), _word_count(0) { _check_code = _word_count ^ _this_ver; }
  157. };
  158. dict_info_t info;
  159. Dict() {}
  160. std::vector<word1_t>words;
  161. void read_dict(const TString& s)
  162. {
  163. if (s.empty())
  164. return;
  165. if (s.find(_T(".txt")) != -1)
  166. return read_dict_dm(s);
  167. clear();
  168. std::fstream file;
  169. file.open(s, std::ios::in | std::ios::binary);
  170. if (!file.is_open())
  171. return;
  172. //读取头信息
  173. file.read((char*)&info, sizeof(info));
  174. //校验
  175. if (info._this_ver == 0 && info._check_code == (info._this_ver ^ info._word_count)) {
  176. //old dict format
  177. words.resize(info._word_count);
  178. info._this_ver = 1;
  179. word_t tmp;
  180. for (size_t i = 0; i < words.size(); i++) {
  181. file.read((char*)&tmp, sizeof(tmp));
  182. words[i].from_word(tmp);
  183. }
  184. //file.read((char*)&words[0], sizeof(word_t)*info._word_count);
  185. }
  186. else if (info._this_ver == 1 && info._check_code == (info._this_ver ^ info._word_count)) {
  187. //new dict format
  188. words.resize(info._word_count);
  189. word1_info head;
  190. for (size_t i = 0; i < words.size(); i++) {
  191. file.read((char*)&head, sizeof(head));
  192. words[i].info = head;
  193. int nlen = (head.w * head.h + 7) / 8;
  194. words[i].data.resize(nlen);
  195. file.read((char*)words[i].data.data(), nlen);
  196. }
  197. }
  198. file.close();
  199. sort_dict();
  200. }
  201. void read_dict_dm(const std::string& s)
  202. {
  203. clear();
  204. std::fstream file;
  205. file.open(s, std::ios::in);
  206. if (!file.is_open())
  207. return;
  208. //读取信息
  209. std::wstring ss;
  210. std::string str;
  211. while (std::getline(file, str)) {
  212. std::string strLocale = setlocale(LC_ALL, "");
  213. const char* chSrc = str.c_str();
  214. size_t nDestSize = mbstowcs(NULL, chSrc, 0) + 1;
  215. wchar_t* wchDest = new wchar_t[nDestSize];
  216. wmemset(wchDest, 0, nDestSize);
  217. mbstowcs(wchDest, chSrc, nDestSize);
  218. std::wstring wstrResult = wchDest;
  219. delete[]wchDest;
  220. setlocale(LC_ALL, strLocale.c_str());
  221. ss = wstrResult;
  222. size_t idx1 = ss.find(L'$');
  223. auto idx2 = ss.find(L'$', idx1 + 1);
  224. word_t wd;
  225. word1_t wd1;
  226. std::wstring name;
  227. if (idx1 != -1 && idx2 != -1) {
  228. ss[idx1] = L'0';
  229. name = ss.substr(idx1 + 1, idx2 - idx1 - 1);
  230. wd.fromDm(ss.data(), idx1, name);
  231. wd1.from_word(wd);
  232. wd1.set_chars(name);
  233. add_word(wd1);
  234. }
  235. }
  236. file.close();
  237. sort_dict();
  238. }
  239. void read_memory_dict_dm(const char* buf, size_t size)
  240. {
  241. clear();
  242. std::stringstream file;
  243. file.write(buf, size);
  244. //读取信息
  245. std::wstring ss;
  246. std::string str;
  247. while (std::getline(file, str)) {
  248. std::string strLocale = setlocale(LC_ALL, "");
  249. const char* chSrc = str.c_str();
  250. size_t nDestSize = mbstowcs(NULL, chSrc, 0) + 1;
  251. wchar_t* wchDest = new wchar_t[nDestSize];
  252. wmemset(wchDest, 0, nDestSize);
  253. mbstowcs(wchDest, chSrc, nDestSize);
  254. std::wstring wstrResult = wchDest;
  255. delete[]wchDest;
  256. setlocale(LC_ALL, strLocale.c_str());
  257. ss = wstrResult;
  258. size_t idx1 = ss.find(L'$');
  259. auto idx2 = ss.find(L'$', idx1 + 1);
  260. word_t wd;
  261. word1_t wd1;
  262. std::wstring name;
  263. if (idx1 != -1 && idx2 != -1) {
  264. ss[idx1] = L'0';
  265. name = ss.substr(idx1 + 1, idx2 - idx1 - 1);
  266. wd.fromDm(ss.data(), idx1, name);
  267. wd1.from_word(wd);
  268. wd1.set_chars(name);
  269. add_word(wd1);
  270. }
  271. }
  272. sort_dict();
  273. }
  274. void write_dict(const std::string& s)
  275. {
  276. std::fstream file;
  277. file.open(s, std::ios::out | std::ios::binary);
  278. if (!file.is_open())
  279. return;
  280. // 删除所有空白字符;
  281. auto it = words.begin();
  282. while (it != words.end())
  283. {
  284. if (it->info.name[0] == L'\0')
  285. it = words.erase(it);
  286. else
  287. ++it;
  288. }
  289. info._word_count = words.size();
  290. //设置校验
  291. info._check_code = info._this_ver ^ info._word_count;
  292. //写入信息
  293. file.write((char*)&info, sizeof(info));
  294. //写入数据
  295. for (int i = 0; i < words.size(); i++) {
  296. file.write((char*)&words[i].info, sizeof(word1_info));
  297. file.write((char*)words[i].data.data(), words[i].data.size());
  298. }
  299. file.close();
  300. }
  301. void add_word(const ImageBin& binary, const rect_t& rc) {
  302. int x2 = min(rc.x1 + 255, rc.x2);
  303. int y2 = min(rc.y1 + 255, rc.y2);
  304. word1_t word;
  305. word.info.w = x2 - rc.x1;
  306. word.info.h = y2 - rc.y1;
  307. word.info.bit_cnt = 0;
  308. word.init();
  309. //word.data.resize((word.info.w * word.info.h + 7) / 8);
  310. int idx = 0;
  311. for (int j = rc.x1; j < x2; ++j) {
  312. for (int i = rc.y1; i < y2; ++i) {
  313. auto val = binary.at(i, j);
  314. if (val == 1) {
  315. SET_BIT(word.data[idx / 8], idx & 7);
  316. ++word.info.bit_cnt;
  317. }
  318. ++idx;
  319. }
  320. }
  321. auto it = find(word);
  322. if (words.empty() || it == words.end()) {
  323. word.set_chars(L"");
  324. words.push_back(word);
  325. info._word_count = words.size();
  326. }
  327. else {//only change char
  328. //word.set_chars(c);
  329. }
  330. }
  331. void sort_dict() {
  332. //sort dict(size: big --> small ,cnt: small -->big)
  333. std::stable_sort(words.begin(), words.end(),
  334. [](const word1_t& lhs, const word1_t& rhs) {
  335. int dh = lhs.info.h - rhs.info.h;
  336. int dw = lhs.info.w - rhs.info.w;
  337. return dh > 0 || (dh == 0 && dw > 0) ||
  338. (dh == 0 && dw == 0 && lhs.info.bit_cnt < rhs.info.bit_cnt);
  339. });
  340. }
  341. void add_word(const word1_t& word) {
  342. auto it = find(word);
  343. if (words.empty() || it == words.end()) {
  344. words.push_back(word);
  345. }
  346. else {
  347. it->set_chars(word.info.name);
  348. }
  349. info._word_count = words.size();
  350. }
  351. void clear() {
  352. info._word_count = 0;
  353. words.clear();
  354. }
  355. std::vector<word1_t>::iterator find(const word1_t& word) {
  356. for (auto it = words.begin(); it != words.end(); ++it)
  357. if (*it == word)return it;
  358. return words.end();
  359. }
  360. void erase(const word1_t& word) {
  361. auto it = find(word);
  362. if (!words.empty() && it != words.end())
  363. words.erase(it);
  364. info._word_count = words.size();
  365. }
  366. int size() const {
  367. return info._word_count;
  368. }
  369. bool empty()const {
  370. return size() == 0;
  371. }
  372. };
  373. #endif