123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416 |
- #pragma once
- #ifndef __DICT_H_
- #define __DICT_H_
#include <algorithm>
#include <clocale>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <istream>
#include <sstream>
#include <string>
#include <vector>

#include "bitfunc.h"
#include "Image.hpp"
#include "helpfunc.h"
// Text string type used by the dictionary code: wide when the build defines
// UNICODE, narrow otherwise.
#if defined(UNICODE)
using TString = std::wstring;
#else
using TString = std::string;
#endif

// Disabled local copies of the bit helpers, kept for reference. The live
// SET_BIT / GET_BIT used further down are expected to come from an included
// header (presumably bitfunc.h -- confirm).
//   #define SET_BIT(x, idx) x |= 1u << (idx)
//   #define GET_BIT(x, idx) (((x )>> (idx)) & 1u)

// Module-level version constant. Note that the per-file format version stored
// in Dict::dict_info_t::_this_ver is a separate value (0 or 1).
const int op_dict_version = 2;

// Fallback min/max macros, only defined when nothing earlier provided them.
// Both arguments may be evaluated twice, so avoid side effects in them.
#if !defined(max)
#define max(a,b) (((a) > (b)) ? (a) : (b))
#endif
#if !defined(min)
#define min(a,b) (((a) < (b)) ? (a) : (b))
#endif
/*
Generation 0 glyph dictionary (the old on-disk format)
*/
- struct word_info_t
- {
- //char of word
- TCHAR _char[4];
- //char height
- __int16 width, height;
- //char bit ct
- __int32 bit_count;
- word_info_t() :width(0), height(0), bit_count(0)
- {
- memset(_char, 0, sizeof(_char));
- }
- bool operator==(const word_info_t& rhs) {
- return width == rhs.width && height == rhs.height;
- }
- bool operator!=(const word_info_t& rhs) {
- return width != rhs.width && height == rhs.height;
- }
- };
- struct word_t
- {
- //32 bit a col
- using cline_t = unsigned __int32;
- word_info_t info;
- //char col line
- cline_t clines[32];
- bool operator==(const word_t& rhs)
- {
- if (info != rhs.info)
- return false;
- for (int i = 0; i < info.width; ++i)
- if (clines[i] != rhs.clines[i])
- return false;
- return true;
- }
- void set_chars(const TString& s)
- {
- memcpy(info._char, s.c_str(), min(sizeof(info._char), (s.length() + 1) * sizeof(TCHAR)));
- }
- void fromDm(const TCHAR* str, int ct, const TString& w)
- {
- int bin[50] = { 0 };
- const int DM_DICT_HEIGTH = 11;
- ct = min(ct, 88);
- int i = 0;
- auto hex2bin = [](TCHAR c)
- {
- return c <= _T('9') ? c - _T('0') : c - _T('A') + 10;
- };
- while (i < ct)
- {
- bin[i / 2] = (hex2bin(str[i]) << 4) | (hex2bin(str[i + 1]));
- i += 2;
- }
- //
- int cols = (ct * 4) / DM_DICT_HEIGTH;
- memset(this, 0x0, sizeof(*this));
- for (int j = 0; j < cols; ++j) {
- for (int i = 0; i < 11; ++i) {
- int idx = j * 11 + i;
- if (GET_BIT(bin[idx >> 3], 7 - (idx & 7))) {
- SET_BIT(clines[j], 31 - i);
- ++info.bit_count;
- }
- }
- }
- info.height = DM_DICT_HEIGTH;
- info.width = cols;
- set_chars(w);
- }
- };
/*
Generation 1 glyph dictionary (the current on-disk format)
*/
- struct word1_info {
- uint8_t w, h;//max is 255 2B
- uint16_t bit_cnt;//max is 255*255=65025<65536 4B
- TCHAR name[8];//name 12B
- word1_info() :w(0), h(0), bit_cnt(0) {}
- };
- struct word1_t
- {
- word1_info info;
- vector<uint8_t> data;//size is (w*h+7)/8
- bool operator==(const word1_t& rhs) {
- if (info.w != rhs.info.w || info.h != rhs.info.h || info.bit_cnt != rhs.info.bit_cnt)
- return false;
- for (size_t i = 0;i < data.size();i++)
- if (data[i] != rhs.data[i])
- return false;
- return true;
- }
- void set_chars(const TString& s) {
- int nlen = s.length() < 8 ? s.length() : 7;
- memcpy(info.name, s.c_str(), nlen * 2);
- info.name[nlen] = L'\0';
- }
- void from_word(word_t& wd) {
- info.w = (uint8_t)wd.info.width;
- info.h = wd.info.height;
- init();
- info.bit_cnt = wd.info.bit_count;
- memcpy(info.name, wd.info._char, 4 * sizeof(wchar_t));
- info.name[3] = 0;
- int idx = 0;
- for (int x = 0; x < info.w; x++) {
- for (int y = 0; y < info.h; y++) {
- if (GET_BIT(wd.clines[x], 31 - y))
- SET_BIT(data[idx / 8], idx & 7);
- idx++;
- }
- }
- }
- void init()
- {
- data.resize((info.w * info.h + 7) / 8);
- std::fill(data.begin(), data.end(), 0);
- }
- };
- struct Dict
- {
- //v0 v1
- struct dict_info_t {
- __int16 _this_ver;//0 1
- __int16 _word_count;
- //check code=_this_ver^_word_count
- __int32 _check_code;
- dict_info_t() :_this_ver(1), _word_count(0) { _check_code = _word_count ^ _this_ver; }
- };
- dict_info_t info;
- Dict() {}
- std::vector<word1_t>words;
- void read_dict(const TString& s)
- {
- if (s.empty())
- return;
- if (s.find(_T(".txt")) != -1)
- return read_dict_dm(s);
- clear();
- std::fstream file;
- file.open(s, std::ios::in | std::ios::binary);
- if (!file.is_open())
- return;
- //读取头信息
- file.read((char*)&info, sizeof(info));
- //校验
- if (info._this_ver == 0 && info._check_code == (info._this_ver ^ info._word_count)) {
- //old dict format
- words.resize(info._word_count);
- info._this_ver = 1;
- word_t tmp;
- for (size_t i = 0; i < words.size(); i++) {
- file.read((char*)&tmp, sizeof(tmp));
- words[i].from_word(tmp);
- }
- //file.read((char*)&words[0], sizeof(word_t)*info._word_count);
- }
- else if (info._this_ver == 1 && info._check_code == (info._this_ver ^ info._word_count)) {
- //new dict format
- words.resize(info._word_count);
- word1_info head;
- for (size_t i = 0; i < words.size(); i++) {
- file.read((char*)&head, sizeof(head));
- words[i].info = head;
- int nlen = (head.w * head.h + 7) / 8;
- words[i].data.resize(nlen);
- file.read((char*)words[i].data.data(), nlen);
- }
- }
- file.close();
- sort_dict();
- }
- void read_dict_dm(const std::string& s)
- {
- clear();
- std::fstream file;
- file.open(s, std::ios::in);
- if (!file.is_open())
- return;
- //读取信息
- std::wstring ss;
- std::string str;
- while (std::getline(file, str)) {
- std::string strLocale = setlocale(LC_ALL, "");
- const char* chSrc = str.c_str();
- size_t nDestSize = mbstowcs(NULL, chSrc, 0) + 1;
- wchar_t* wchDest = new wchar_t[nDestSize];
- wmemset(wchDest, 0, nDestSize);
- mbstowcs(wchDest, chSrc, nDestSize);
- std::wstring wstrResult = wchDest;
- delete[]wchDest;
- setlocale(LC_ALL, strLocale.c_str());
- ss = wstrResult;
- size_t idx1 = ss.find(L'$');
- auto idx2 = ss.find(L'$', idx1 + 1);
- word_t wd;
- word1_t wd1;
- std::wstring name;
- if (idx1 != -1 && idx2 != -1) {
- ss[idx1] = L'0';
- name = ss.substr(idx1 + 1, idx2 - idx1 - 1);
- wd.fromDm(ss.data(), idx1, name);
- wd1.from_word(wd);
- wd1.set_chars(name);
- add_word(wd1);
- }
- }
- file.close();
- sort_dict();
- }
- void read_memory_dict_dm(const char* buf, size_t size)
- {
- clear();
- std::stringstream file;
- file.write(buf, size);
- //读取信息
- std::wstring ss;
- std::string str;
- while (std::getline(file, str)) {
- std::string strLocale = setlocale(LC_ALL, "");
- const char* chSrc = str.c_str();
- size_t nDestSize = mbstowcs(NULL, chSrc, 0) + 1;
- wchar_t* wchDest = new wchar_t[nDestSize];
- wmemset(wchDest, 0, nDestSize);
- mbstowcs(wchDest, chSrc, nDestSize);
- std::wstring wstrResult = wchDest;
- delete[]wchDest;
- setlocale(LC_ALL, strLocale.c_str());
- ss = wstrResult;
- size_t idx1 = ss.find(L'$');
- auto idx2 = ss.find(L'$', idx1 + 1);
- word_t wd;
- word1_t wd1;
- std::wstring name;
- if (idx1 != -1 && idx2 != -1) {
- ss[idx1] = L'0';
- name = ss.substr(idx1 + 1, idx2 - idx1 - 1);
- wd.fromDm(ss.data(), idx1, name);
- wd1.from_word(wd);
- wd1.set_chars(name);
- add_word(wd1);
- }
- }
- sort_dict();
- }
- void write_dict(const std::string& s)
- {
- std::fstream file;
- file.open(s, std::ios::out | std::ios::binary);
- if (!file.is_open())
- return;
- // 删除所有空白字符;
- auto it = words.begin();
- while (it != words.end())
- {
- if (it->info.name[0] == L'\0')
- it = words.erase(it);
- else
- ++it;
- }
- info._word_count = words.size();
- //设置校验
- info._check_code = info._this_ver ^ info._word_count;
- //写入信息
- file.write((char*)&info, sizeof(info));
- //写入数据
- for (int i = 0; i < words.size(); i++) {
- file.write((char*)&words[i].info, sizeof(word1_info));
- file.write((char*)words[i].data.data(), words[i].data.size());
- }
- file.close();
- }
- void add_word(const ImageBin& binary, const rect_t& rc) {
- int x2 = min(rc.x1 + 255, rc.x2);
- int y2 = min(rc.y1 + 255, rc.y2);
- word1_t word;
- word.info.w = x2 - rc.x1;
- word.info.h = y2 - rc.y1;
- word.info.bit_cnt = 0;
- word.init();
- //word.data.resize((word.info.w * word.info.h + 7) / 8);
- int idx = 0;
- for (int j = rc.x1; j < x2; ++j) {
- for (int i = rc.y1; i < y2; ++i) {
- auto val = binary.at(i, j);
- if (val == 1) {
- SET_BIT(word.data[idx / 8], idx & 7);
- ++word.info.bit_cnt;
- }
- ++idx;
- }
- }
- auto it = find(word);
- if (words.empty() || it == words.end()) {
- word.set_chars(L"");
- words.push_back(word);
- info._word_count = words.size();
- }
- else {//only change char
- //word.set_chars(c);
- }
- }
- void sort_dict() {
- //sort dict(size: big --> small ,cnt: small -->big)
- std::stable_sort(words.begin(), words.end(),
- [](const word1_t& lhs, const word1_t& rhs) {
- int dh = lhs.info.h - rhs.info.h;
- int dw = lhs.info.w - rhs.info.w;
- return dh > 0 || (dh == 0 && dw > 0) ||
- (dh == 0 && dw == 0 && lhs.info.bit_cnt < rhs.info.bit_cnt);
- });
- }
- void add_word(const word1_t& word) {
- auto it = find(word);
- if (words.empty() || it == words.end()) {
- words.push_back(word);
- }
- else {
- it->set_chars(word.info.name);
- }
- info._word_count = words.size();
- }
- void clear() {
- info._word_count = 0;
- words.clear();
- }
- std::vector<word1_t>::iterator find(const word1_t& word) {
- for (auto it = words.begin(); it != words.end(); ++it)
- if (*it == word)return it;
- return words.end();
- }
- void erase(const word1_t& word) {
- auto it = find(word);
- if (!words.empty() && it != words.end())
- words.erase(it);
- info._word_count = words.size();
- }
- int size() const {
- return info._word_count;
- }
- bool empty()const {
- return size() == 0;
- }
- };
- #endif
|