string_util.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129
  1. # -*- coding:utf-8 -*-
  2. import os, sys, time
  3. import chardet
  4. import re
  5. '''
  6. 将任意编码的str类型字符串,转码为utf-8.但是ascii编码的字符串,不会转码。
  7. '''
  8. def toUTF8Str(str1):
  9. if str1.__len__() == 0:
  10. return ""
  11. detType = chardet.detect(str1)
  12. encoding = detType['encoding']
  13. retStr = str1.decode(encoding).encode("utf-8")
  14. return retStr
  15. '''
  16. 将str类型的字符串解码,返回解码后的字符串
  17. '''
  18. def decode(str1):
  19. if str1.__len__() == 0:
  20. return ""
  21. detType = chardet.detect(str1)
  22. encoding = detType['encoding']
  23. retStr = str1.decode(encoding)
  24. return retStr
  25. '''
  26. 把拼接的文件路径转化成windows可直接打开的路径,方便调试
  27. '''
  28. def pathToWindowsPath(str_path):
  29. str_window_path = str_path.replace('/', '\\')
  30. return str_window_path
  31. '''
  32. # 描述:比较字符串s1和s2,但不区分字母的大小写
  33. 用于COR文字识别的字符串匹配判断
  34. # 返回值:相等返回True.
  35. #
  36. # '''
  37. def strcmp(str1, str2, erase = []):
  38. if type(str1) == type(u''):
  39. str1 = str(str1).encode('utf-8')
  40. else:
  41. try:
  42. str1 = str(str1).encode('utf-8')
  43. except Exception:
  44. pass
  45. if type(str2) == type(u''):
  46. str2 = str(str2).encode('utf-8')
  47. else:
  48. try:
  49. str2 = str(str2).encode('utf-8')
  50. except Exception:
  51. pass
  52. # 去除空格;
  53. str1 = str1.replace(' ', '').lower()
  54. str2 = str2.replace(' ', '').lower()
  55. # 移除指定字符;
  56. for char in erase:
  57. str1 = str1.replace(char, '').lower()
  58. str2 = str2.replace(char, '').lower()
  59. #长度判断
  60. if len(str1) != len(str2):
  61. return False
  62. # 遍历字符串
  63. result = True
  64. # 忽略的相似字符;
  65. ignore = [{'i','l','1','t','I','T'},{'o','0','O'}]
  66. cnt = len(str2)
  67. for i in range(0, cnt):
  68. if str2[i] == str1[i]:
  69. continue
  70. elif str2[i] in ignore[0] and str1[i] in ignore[0]:
  71. continue
  72. elif str2[i] in ignore[1] and str1[i] in ignore[1]:
  73. continue
  74. else:
  75. result = False
  76. break
  77. #endfor
  78. return result
  79. '''
  80. 根据传入的字符串,找出数字,组成数组输出
  81. :param :mulStr 混合了字符和数字的字符串
  82. :return : 返回数字数组.例如:['11.11', '22']
  83. '''
  84. def getDigitFromString(mulStr):
  85. # ret = re.findall("\d+\.?\d*", mulStr)
  86. # ret = re.findall(r'(-?[\d]+)', mulStr)
  87. ret = re.findall(r'(-?\d+\.?\d*)', mulStr)
  88. return ret
  89. "将字符串拆分成数组"
  90. def strToList(srcStr, reg):
  91. arr = srcStr.split(reg)
  92. retArr = []
  93. for index in range(arr.__len__()):
  94. if arr[index].__len__() > 0:
  95. retArr.append(arr[index])
  96. return retArr
  97. if __name__ == "__main__":
  98. # str1 = "中文test11.11test22"
  99. str1 = "-1sdadad32dadsa2.4afsa-6.8"
  100. # ret1 = toUTF8Str(str1)
  101. # print "test1 = ret1:", str1 == ret1, chardet.detect(str1), chardet.detect(ret1)
  102. # ret = getDigitFromString(str1)
  103. # print ret
  104. str2 = u"left, right"
  105. print str2.split(",")
  106. arr = strToList(str2, ",")
  107. print "strToArr:",arr