# MOKA / scbc_repos
							# -*- coding:utf-8 -*-
import sys, os, time

reload(sys)
sys.setdefaultencoding("utf-8")
from TST.NetOCR import *
from ssat_sdk.picture import image_util
from ssat_sdk.utils import LoggingUtil
from sat_environment import *
from utils import string_util
from picture.ocr_baidu import OCRBaidu
from picture.ocr_tesseract import OCRTes
from picture import ocr_tesseract
import json
import cv2 as cv
import numpy as np


# Sentinel string returned by the OCR helpers when recognition raised an error.
EXP_Info = "ERR<Exp>"

# Every natural-language name known to this module.
OCR_LanDIC = [
    "chineseprc+english",
    "chinesetaiwan+english",
    "spanish",
    "chineseprc",
    "chinesetaiwan",
    "russian",
    "french",
    "english",
    "vietnamese",
    "hebrew",
    "thai",
    "arabic",
    "portuguese",
    "german",
    "italian",
    "japanese",
    "korean",
]

# OCR type numbering, one range per engine:
#   abbyy:     0 ~ 999
#   tesseract: 1000 ~ 9999
#   baidu:     10000 ~ 10100

# ABBYY OCR definitions: natural-language name -> ABBYY language identifier.
Abbyy_LanDIC = {
    "chineseprc+english": "ChinesePRC+English",
    "chinesetaiwan+english": "ChineseTaiwan+English",
    "spanish": "Spanish",
    "chineseprc": "ChinesePRC",
    "chinesetaiwan": "ChineseTaiwan",
    "russian": "Russian",
    "french": "French",
    "english": "English",
    "vietnamese": "Vietnamese",
    "hebrew": "Hebrew",
    "thai": "Thai",
}

# Bounds of the ABBYY type range.
ABBYY_BASIC = 0
ABBYY_MAX = 999

# ABBYY recognition types, numbered upwards from ABBYY_BASIC.
ABBYY_CONTRAST_ENABLE = ABBYY_BASIC
ABBYY_CONTRAST_DISABLE = ABBYY_BASIC + 1
ABBYY_NORMAL = ABBYY_BASIC + 2
ABBYY_CONVERSION_ENGINE_ENABLE = ABBYY_BASIC + 3
ABBYY_INVERT_IMAGE_ENABLE = ABBYY_BASIC + 4
ABBYY_CONVERSION_AND_CONTRAST_ENABLE = ABBYY_BASIC + 5
ABBYY_ENGLISH_RECOMMENDATION_MODE = 253

# All ABBYY types defined above, in declaration order.
Abbyy_TypeList = [
    ABBYY_CONTRAST_ENABLE,
    ABBYY_CONTRAST_DISABLE,
    ABBYY_NORMAL,
    ABBYY_CONVERSION_ENGINE_ENABLE,
    ABBYY_INVERT_IMAGE_ENABLE,
    ABBYY_CONVERSION_AND_CONTRAST_ENABLE,
    ABBYY_ENGLISH_RECOMMENDATION_MODE,
]

# Tesseract OCR definitions. Tesseract has a single recognition type.
Tes_BASIC = 1000
Tes_MAX = 9999
Tes_TypeList = [Tes_BASIC]
# The language table lives in the ocr_tesseract module and is aliased here.
Tes_LanDIC = ocr_tesseract.Tes_LanDIC

# Baidu OCR definitions: natural-language name -> Baidu language code.
Baidu_LanDIC = {
    "chineseprc+english": "CHN_ENG",
    "spanish": "SPA",
    "russian": "RUS",
    "french": "FRE",
    "english": "ENG",
    "portuguese": "POR",
    "german": "GER",
    "italian": "ITA",
    "japanese": "JAP",
    "korean": "KOR",
}

# Bounds of the Baidu type range.
Baidu_BASIC = 10000
Baidu_MAX = 10100

# Baidu recognition types: general and high-accuracy.
Baidu_General = Baidu_BASIC
Baidu_Accurate = Baidu_BASIC + 1
Baidu_TypeList = [Baidu_General, Baidu_Accurate]

# Working directories used by this module:
#   OCR_TMP_DIR receives the pre-processed pictures written before recognition,
#   OCR_ERR_DIR receives copies of pictures whose OCR text did not match.
OCR_TMP_DIR = getOCRTmpDir()
OCR_ERR_DIR = getOCRErrDir()
# makedirs (not mkdir) so that a missing parent directory does not make the
# import of this module fail with OSError.
if not os.path.exists(OCR_TMP_DIR):
    os.makedirs(OCR_TMP_DIR)
if not os.path.exists(OCR_ERR_DIR):
    os.makedirs(OCR_ERR_DIR)

class OCRConvert():
    OCR_PRODUCT_BAIDU = "baidu"
    OCR_PRODUCT_ABBYY = "abbyy"
    OCR_PRODUCT_TES = "tesseract"
    def __init__(self):
        """Create the converter together with its Baidu and Tesseract engines."""
        # Timeout in seconds, changed through setTimeOut(). The only visible
        # consumer (in sendPicToServer) is currently commented out.
        self.timeout = 3
        self.ocrBaidu = OCRBaidu()
        self.ocrTes = OCRTes()

    def setTimeOut(self, seconds):
        """Store the timeout value (in seconds) on this converter."""
        self.timeout = seconds

    '''
    根据自然语言描述，和OCR Type，返回OCR识别语言的字符串定义
    :param language: chineseprc+english/chinesetaiwan+english/spanish/chineseprc/chinesetaiwan/russian/french/english/vietnamese/hebrew
                    /portuguese/german/italian/japanese/korean
    :param type: OCR识别类型编号
    :return OCR语言类别定义的字符串.如果在字典库找不到，直接返回传入的language。
    '''
    def getOCRLaParam(self, language, type):
        """Translate a natural-language name into the identifier of the engine owning ``type``.

        :param language: name such as "english" or "chineseprc+english";
            looked up case-insensitively.
        :param type: OCR type number. It selects the ABBYY, Tesseract or Baidu
            table; every range is half-open, so the ``*_MAX`` value itself
            belongs to no engine.
        :return: the engine-specific language identifier, or ``language``
            unchanged when the type is outside every range or the name is not
            present in the selected table.
        """
        if ABBYY_BASIC <= type < ABBYY_MAX:
            table = Abbyy_LanDIC
        elif Tes_BASIC <= type < Tes_MAX:
            table = Tes_LanDIC
        elif Baidu_BASIC <= type < Baidu_MAX:
            table = Baidu_LanDIC
        else:
            # Unknown engine: nothing to translate.
            return language
        return table.get(language.lower(), language)

    """
    根据传入的语言类型（无论自然语言类型，或者ocr语言类型），转换成目标自然语言类型
    """
    def getHumanLan(self, language, type):
        """Convert a language given in either form into its natural-language name.

        :param language: a natural-language name (a key of the engine's table)
            or an engine-specific identifier (a value of that table). Unlike
            getOCRLaParam, the key lookup here is case-sensitive.
        :param type: OCR type number selecting the ABBYY, Tesseract or Baidu
            table.
        :return: ``language`` itself when it already is a key of the table;
            otherwise the key whose value matches ``language`` according to
            ``string_util.strcmp``; "" when nothing matches or the type is
            outside every engine range.
        """
        if ABBYY_BASIC <= type < ABBYY_MAX:
            table = Abbyy_LanDIC
        elif Tes_BASIC <= type < Tes_MAX:
            table = Tes_LanDIC
        elif Baidu_BASIC <= type < Baidu_MAX:
            table = Baidu_LanDIC
        else:
            return ""

        if language in table:
            return language
        # Reverse lookup: find the key whose engine identifier matches.
        for humanLan in table:
            if string_util.strcmp(table[humanLan], language):
                return humanLan
        return ""

    '''
    根据传入的OCR 类型，返回厂家名字
    :param type:OCR识别类型编号
    :return OCR厂家名称
    '''
    def getOCRProductByType(self, type):
        """Return the vendor name owning an OCR type number.

        :param type: OCR type number.
        :return: one of the ``OCR_PRODUCT_*`` class constants, or "" when the
            number lies outside every engine range (each range excludes its
            ``*_MAX`` value).
        """
        productRanges = (
            (ABBYY_BASIC, ABBYY_MAX, self.OCR_PRODUCT_ABBYY),
            (Tes_BASIC, Tes_MAX, self.OCR_PRODUCT_TES),
            (Baidu_BASIC, Baidu_MAX, self.OCR_PRODUCT_BAIDU),
        )
        for lowest, limit, product in productRanges:
            if lowest <= type < limit:
                return product
        return ""

    def sendPicToServer(self, picPath, lan, type):
        try:
            ocr = NetOCR(getOCRIpAddr(), getOCRPort(), picPath, lan, type)
            # ocr.setTimeOut(self.timeout)
            return ocr
        except Exception, e:
            print "OCR", u"SCBC OCR连接失败,Err:" ,e
            return None

    '''
    在getStr函数基础上，添加了图片处理参数。处理图片后，再使用图片识别。
    :param imgProcParams: 字典：{"Threshold":[127,250, 阈值类型], "contrast":[], "LaplaceSharp":[], "Noisy":[]}
    :param picPath: 图片路径
    :param lan :OCR识别用的语言类别，不同OCR产品的定义不一样
    :param type:OCR识别的类型编号
    :return 识别后的字符串。同getStr函数
    '''
    def getStrWithImgProcess(self, picPath, imgProcParams, lan, type, reconTimes = 5):
        print "getStrWithImgProcess, param:", picPath, imgProcParams,lan,type,reconTimes
        destPicPath = os.path.join(OCR_TMP_DIR, "ocr_"+str(time.time())+".png")
        img = self.handleImage(picPath, imgProcParams)
        cv.imwrite(destPicPath, img)
        return self.getStr(destPicPath, lan, type)

    '''
    根据图片处理参数，处理图片，返回image对象
    :param imgProcParams: 字典：{"Threshold":[127,250, 阈值类型], "contrast":[], "LaplaceSharp":[], "Noisy":[]}
    :param picPath: 图片路径
    :return 返回图片image对象
    '''
    def handleImage(self, picPath, imgProcParams):
        print u"handleImage,imgProcParams:", imgProcParams
        srcImg = cv.imread(picPath)
        destPicPath = picPath
        img = srcImg
        if imgProcParams.has_key("Threshold"):
            img = image_util.saveThresholdPicImg(srcImg, \
                        imgProcParams["Threshold"][0], imgProcParams["Threshold"][1], imgProcParams["Threshold"][2])
        return img

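    '''
    Recognise the text of the picture at the given path.
    :param picPath. Absolute path of the picture.
    :param lan. Language of the text to recognise. A natural-language name is translated
        with getOCRLaParam; for ABBYY (type < 1000) the engine identifiers are:
            ChinesePRC+English
            ChineseTaiwan+English
            Spanish
            ChinesePRC
            ChineseTaiwan
            Russian
            French
            English
            Vietnamese
            Hebrew
            Thai
    :param type. OCR type number selecting the engine and its recognition model.
    :param reconTimes. Number of retries when Baidu OCR fails. Default 5; values outside
        0..10 are treated as 10.
    :return str. The recognised string, or "" when the ABBYY server cannot be reached.
                For Baidu OCR (type 10000 or 10001) the recognised lines are joined into
                one space-separated string.
                When recognition raises an error, the string "ERR<Exp>" is returned.
    '''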
    def getStr(self, picPath, lan, type, reconTimes = 5):
        print "getStr, param:", picPath, lan, type, reconTimes
        lan = self.getOCRLaParam(lan, type)
        if type < ABBYY_MAX and type >= ABBYY_BASIC:
            ocr = self.sendPicToServer(picPath, lan, type)
            if (ocr == None):
                return ""
            try:
                LoggingUtil.printLog("OCR", u"泰彼 OCR")
                string = ocr.getStr()
                # print string
                return string_util.toUTF8Str(string)
            except Exception, e:
                print u"OCR", u"获取文字失败.error:" ,e
                return EXP_Info
            finally:
                ocr.close()
        elif type >=Tes_BASIC and type < Tes_MAX:
            return self.ocrTes.getStr(picPath, lan, type)
        else:
            # 超出范围的重连次数，都默认为10次;
            if reconTimes > 10 or reconTimes < 0:
                reconTimes = 10
            # by zippo 把百度的返回数组转换成字符串，统一返回值
            allStr = self.ocr_BaiduGS(picPath, type - 10000, lan)
            while allStr is None or allStr == EXP_Info and reconTimes > 0:
                LoggingUtil.printLog("OCR", u'百度OCR重连')
                time.sleep(1)
                reconTimes -= 1
                allStr = self.ocr_BaiduGS(picPath, type - 10000, lan)
            
            return EXP_Info if allStr is None else allStr
            # return self.ocr_Baidu(picPath, type-10000, lan)

    # 字符串比对
    def cmpOcrStr(self, ocrStr, stdStr, erase = [], picPath=None):
        # print u"OCR_Convert:cmpOcrStr start param:", ocrStr, stdStr, erase, picPath if type(picPath) == type('') else 'path is imgObj'
        if type(ocrStr) == type(u''):
            ocrStr = str(ocrStr).encode('utf-8')
        else:
            try:
                ocrStr = str(ocrStr).encode('utf-8')
            except Exception:
                pass

        if type(stdStr) == type(u''):
            stdStr = str(stdStr).encode('utf-8')
        else:
            try:
                stdStr = str(stdStr).encode('utf-8')
            except Exception:
                pass
        
        # 去除空格;
        ocrStr = ocrStr.replace(' ', '').lower()
        stdStr = stdStr.replace(' ', '').lower()
        # 移除指定字符;
        for char in erase:
            ocrStr = ocrStr.replace(char, '').lower()
            stdStr = stdStr.replace(char, '').lower()
            
        #长度判断
        if len(ocrStr) != len(stdStr):
            self.saveOCRErr(ocrStr, stdStr, picPath)
            return False

        # 遍历字符串
        result = True
        # 忽略的相似字符;
        ignore = [{'i','l','1','t','I','T'},{'o','0','O'}]
        cnt = len(stdStr)
        for i in range(0, cnt):
            if stdStr[i] == ocrStr[i]:
                continue
            elif stdStr[i] in ignore[0] and ocrStr[i] in ignore[0]:
                continue
            elif stdStr[i] in ignore[1] and ocrStr[i] in ignore[1]:
                continue
            else:
                result = False
                break
        #endfor
        if result is False:
            self.saveOCRErr(ocrStr,stdStr, picPath)
        return result
    #end

    def saveOCRErr(self, ocrStr, stdStr, pic):
        """Keep a copy of a picture whose OCR text did not match, for later inspection.

        The copy is stored in OCR_ERR_DIR as "<ocrStr>_<stdStr>.png". Saving is
        best-effort: every failure is printed as a warning and never raised.

        :param ocrStr: text produced by OCR (used in the file name).
        :param stdStr: expected text (used in the file name).
        :param pic: path of the picture (str or unicode), or an image array
            that is written with ``cv.imwrite``. None or "" skips saving.
        """
        # Imported here because no explicit `import shutil` is visible at
        # module level; without it the copy below can only fail.
        import shutil

        # Only compare strings with '': evaluating `array == ''` on an image
        # array is not a reliable truth value.
        if pic is None or (isinstance(pic, basestring) and pic == ''):
            print u"Warn:Save OCR Error picture fail. <pic> is None or Empty"
            return
        try:
            # Built inside the try block so that an undecodable text cannot
            # turn a failed comparison into an exception.
            destPicName = unicode(ocrStr + "_" + stdStr + ".png")
            destPic = os.path.join(OCR_ERR_DIR, destPicName)
            if isinstance(pic, basestring):
                # A path, byte string or unicode alike.
                shutil.copyfile(pic, destPic)
            else:
                # An image array (numpy.ndarray).
                cv.imwrite(destPic, pic)
        except Exception as e:
            # Print the exception itself: `e.message` is empty for IOError
            # and other exceptions built from several arguments.
            print u"Warn:Save  OCR Error picture fail.", e

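    '''
    Recognise the picture with the given OCR type and compare the result with the
    target string stdStr. Other OCR types are not tried by default.
    :param stdStr : target string.
    :param picPath: path of the picture.
    :param lan : language used for OCR; each OCR product defines its own identifiers.
    :param type: OCR type number.
    :param imgProcParams: dict: {"Threshold":[127,250, threshold type], "contrast":[], "LaplaceSharp":[], "Noisy":[]}
    :param erase: list of strings that are removed from both texts before they are compared.
    :return boolean,string: boolean tells whether the comparison succeeded (True/False); string is the text recognised by OCR.
    '''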
    def findPicStr(self, stdStr, picPath, lan, type, imgProcParams={}, erase = [], tryAllTypes=False):
        """Check whether the OCR text of a picture matches ``stdStr``.

        The picture is recognised with the requested OCR type first. Trying
        the remaining ABBYY and Baidu types is opt-in through ``tryAllTypes``;
        that fallback used to sit behind an unconditional ``return`` and
        never ran, so the default keeps the previous behaviour.

        :param stdStr: target string.
        :param picPath: path of the picture.
        :param lan: OCR language, natural-language name or engine identifier.
        :param type: OCR type number tried first.
        :param imgProcParams: image processing options, see handleImage
            (only read, never modified).
        :param erase: strings removed from both texts before comparing
            (only read, never modified).
        :param tryAllTypes: when True and the requested type does not match,
            every other type of Abbyy_TypeList and Baidu_TypeList is tried.
        :return: ``(matched, text)``. ``text`` is the string that matched;
            when nothing matched it is the text recognised by the requested
            type.
        """
        ocrStr = self.getStrWithImgProcess(picPath, imgProcParams, lan, type, reconTimes = 1)
        # Only this first comparison passes picPath, so only it can store an
        # error picture.
        ret = self.cmpOcrStr(ocrStr, stdStr, erase, picPath)
        if ret is True or not tryAllTypes:
            return ret, ocrStr

        # Other engines name languages differently, so go through the
        # natural-language name; keep the caller's value when it is unknown.
        humanLan = self.getHumanLan(lan, type) or lan
        for otherType in Abbyy_TypeList + Baidu_TypeList:
            if otherType == type:
                continue
            ocrLan = self.getOCRLaParam(humanLan, otherType)
            otherStr = self.getStrWithImgProcess(picPath, imgProcParams, ocrLan, otherType, reconTimes=1)
            if self.cmpOcrStr(otherStr, stdStr, erase, "") is True:
                return True, otherStr
        return False, ocrStr

    # 指定ocr列表，遍历查找;
    def findPicStrEx(self, stdStr, picPath, list_ocr, imgProcParams={}, erase = []):
        """Try a list of OCR settings until one recognises text containing ``stdStr``.

        The check is a case-insensitive substring test.

        :param stdStr: text to look for.
        :param picPath: path of the picture.
        :param list_ocr: iterable of dicts with the keys "lan" and "type",
            tried in order.
        :param imgProcParams: image processing options, see handleImage.
        :param erase: accepted for signature compatibility; not used here.
        :return: ``(found, text)`` where ``text`` is the result of the last
            setting that was tried ("" when ``list_ocr`` is empty).
        """
        result, ocrStr = False, ''
        for item in list_ocr:
            ocrStr = self.getStrWithImgProcess(picPath, imgProcParams, item["lan"], item["type"], reconTimes = 1)
            # A failed recognition is never a hit: the error sentinel must not
            # be searched like real text (e.g. stdStr "exp" is a substring of
            # "ERR<Exp>").
            if ocrStr is None or ocrStr == EXP_Info:
                continue
            if stdStr.lower() in ocrStr.lower():
                result = True
                break
        return result, ocrStr
    '''
    根据传入的图片路径，获取图片的字符串。
    :param picPath.图片的绝对路径
    :param lan. OCR识别的文字语言类别。
    :param type. OCR识别文字时的识别模型编号。
    :param area. OCR识别的图片区域。要求是图片刚好适配的矩形框区域。
    :return str. 返回识别的字符串。未识别到，返回"".
    '''

    def getPositionStr(self, picPath, lan, type, area):
        ocr = self.sendPicToServer(picPath, lan, type)
        if (ocr == None):
            return ""
        try:
            startX, startY, endX, endY = area
            string = ocr.getPositionStr(startX, startY, endX, endY)
            return string_util.toUTF8Str(string)
        except Exception, e:
            print "OCR", u"获取文字失败.error:",e
            return EXP_Info
        finally:
            ocr.close()

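    '''
    Find where a piece of text is located in the picture at the given path.
    :param picPath. Absolute path of the picture.
    :param lan. Language of the text to recognise.
    :param type. OCR type number selecting the recognition model.
    :param keyword. The text to look for.
    :return area. The area reported by the OCR server for keyword. When the server
                  cannot be reached or the lookup fails, [-1,-1,-1,-1] is returned.
    '''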

    def getStrPosition(self, picPath, lan, type, keyword):
        ocr = self.sendPicToServer(picPath, lan, type)
        if (ocr == None):
            return [-1, -1, -1, -1]
        try:
            area = ocr.getStrPosition(keyword)
            return area
        except Exception, e:
            print "OCR", u"获取文字位置失败.error:",e
            return [-1, -1, -1, -1]
        finally:
            ocr.close()

    '''
   根据传入的图片路径，获取图片的字符串。
   :param picPath.图片的绝对路径
   :param lan. OCR识别的文字语言类别。
   :param type. OCR识别文字时的识别模型编号。
   :param area. OCR识别的图片区域。
   :return str. 返回识别的字符串。未识别到，返回""，
   '''

    def getAreaStr(self, picPath, lan, type, area):
        ocr = self.sendPicToServer(picPath, lan, type)
        if (ocr == None):
            return ""
        try:
            startX, startY, endX, endY = area
            string = ocr.getAreaStr(startX, startY, endX, endY)
            return string_util.toUTF8Str(string)
        except Exception, e:
            print "OCR", u'获取文字失败.error:',e
            return EXP_Info
        finally:
            ocr.close()

    def close(self):
        """Do nothing; the method body is empty."""
        pass

    '''
    :param language:
        识别语言类型，默认为CHN_ENG。可选值包括：
        - CHN_ENG#中英文混合；
        - ENG#英文；
        - POR#葡萄牙语；
        - FRE#法语；
        - GER#德语；
        - ITA#意大利语；
        - SPA#西班牙语；
        - RUS#俄语；
        - JAP#日语；
        - KOR#韩语
    :param type:
        0 basicGeneral;1 basicAccurate 高精度通用文字; 选择高精度时，language参数失效。
    :return :
        正常返回字符串数组，如果异常，返回['ERR<Exp>']
    '''

    def ocr_Baidu(self, pic_path, type, language="CHN_ENG"):
        if type == 0:
            ret = self.ocrBaidu.basicGeneral(pic_path, language)
            if ret is None:
                return [EXP_Info]
            return ret
        elif type == 1:
            ret = self.ocrBaidu.basicAccurate(pic_path, language)
            if ret is None:
                return [EXP_Info]
            return ret

    '''
    :param language:
        识别语言类型，默认为CHN_ENG。可选值包括：
        - CHN_ENG#中英文混合；
        - ENG#英文；
        - POR#葡萄牙语；
        - FRE#法语；
        - GER#德语；
        - ITA#意大利语；
        - SPA#西班牙语；
        - RUS#俄语；
        - JAP#日语；
        - KOR#韩语
    :param type:
        0 basicGeneral;1 basicAccurate 高精度通用文字; 选择高精度时，language参数失效。
    '''
    def ocr_BaiduGS(self, pic_path, type, language="CHN_ENG"):
        str1 = ""
        strList = self.ocr_Baidu(pic_path, type, language="CHN_ENG")
        for index in range(strList.__len__()):
            strT = strList[index]
            if index == 0:
                str1 = strT
                continue
            str1 += " "+strT
        return str1


if __name__ == "__main__":
    # Ad-hoc manual checks. Each section is switched on or off by editing its
    # `if 0` / `if 1` guard; the picture paths are machine-specific (D: drive).

    # Section 1 (disabled): recognise one local picture with Tesseract
    # (type 1000). reconTimes is passed as -1.
    if 0:
        # DIR = r"D:\ocr_err\\"
        DIR = r"D:\temp-pic\gbk\\"
        # picPath = DIR + "mi_4.png"
        # picPath = DIR + "ocr_2.png"
        picPath = DIR + "546473243246047508.png"
        # DIR = u"D:/temp-pic/"
        # picPath = DIR + "20180604113525.png"
        # pic_path = r"D:\ocr_err\checkChannelList_crop.jpg"
        # Only the last pic_path assignment takes effect; picPath above is
        # not used by the call below.
        pic_path = r"D:\ocr_err\11.png"
        pic_path = r"D:\ocr_err\mi_1.png"
        pic_path = r"D:\ocr_err\mi\5.png"
        ocrCon = OCRConvert()
        # allStr = ocrCon.getStr(pic_path,"english", 1000, -1)
        allStr = ocrCon.getStr(pic_path,"chineseprc", 1000, -1)
        print "ocr:",allStr
        sys.exit(0)
    # Section 2 (disabled): exercise cmpOcrStr with look-alike characters,
    # using unicode/unicode, str/str and str/unicode argument pairs.
    if 0:
        # s = u'林'
        # print s.encode('utf-8')
        # print s.encode('gbk')
        ocrCon = OCRConvert()
        ocrStr = u"1你好T0i"
        stdStr = u"i你好io1"
        result = ocrCon.cmpOcrStr(ocrStr, stdStr)
        print u'结果',result

        ocrStr = "1你好T0i"
        stdStr = "i你好io1"
        result = ocrCon.cmpOcrStr(ocrStr, stdStr)
        print u'结果',result

        ocrStr = "1你好T0i"
        stdStr = u"i你好io1"
        result = ocrCon.cmpOcrStr(ocrStr, stdStr)
        print u'结果',result

        # ocrStr = unicode("1你好T0i",'gbk')
        # stdStr = u"i你好io1"
        # print ocrStr,stdStr,type(ocrStr),type(stdStr)
        # result = ocrCon.cmpOcrStr(ocrStr, stdStr)
        # print u'结果',result
    # Section 3 (enabled): look up the position of "hdmi" in a screenshot
    # using OCR type 253 (ABBYY_ENGLISH_RECOMMENDATION_MODE).
    if 1:
        ocr = OCRConvert()

        pic = r'D:\temp-pic\UI\NT72-1\home.png'
        # strAll = ocr.getStr(pic, "english", 253)
        # print "strall:",strAll
        position = ocr.getStrPosition(pic, "english", 253, "hdmi")
        print "position:",position