找到某种特定形式的字符串
找出某种特定形式的字符串
# encoding:utf-8
import re
import time
from functools import reduce  # noqa: F401  (no longer used here; kept so the module's names are unchanged)
from itertools import groupby
from typing import Iterator, Optional, Tuple


# --------------------------class RePattern--------------------------------
class RePattern:
    """Regular expression derived from the *shape* of a sample string.

    The sample is split into runs ("blocks") of consecutive characters of
    the same class (lowercase, uppercase, digit, whitespace, other).  Each
    run becomes one counted character class, so ``'xXX34XXX'`` produces
    ``[a-z]{1}[A-Z]{2}[0-9]{2}[A-Z]{3}``.  The compiled expression is then
    used to test other strings for the same shape.
    """

    def __init__(self, patternStr: str) -> None:
        """Build and compile the regular expression for *patternStr*."""
        self.pattern = ''    # regular-expression source text
        self.pre = None      # compiled pattern returned by re.compile
        self.patternLength = len(patternStr)
        self.bCount = 1      # legacy run-length scratch counter, kept for compatibility
        self.block = []      # run lengths, e.g. 'xXX34XXX' -> [1, 2, 2, 3]
        self.blockType = []  # run classes, e.g. 'xXX34XXX' -> [lower, upper, digit, upper]
        # Opening half of the counted character class for each character type.
        # Raw strings: '\s' is not a valid escape in a normal string literal.
        self.typedict = {
            'lower': r'[a-z]{',
            'upper': r'[A-Z]{',
            'digit': r'[0-9]{',
            'space': r'[\s]{',
            # The former '[\D\W\S]' was a union that matched *every*
            # character; "other" must exclude the classes listed above.
            'other': r'[^a-zA-Z0-9\s]{',
        }
        self.__setPattern(patternStr)

    def __setPattern(self, patternStr: str) -> None:
        """Fill ``block``/``blockType`` and compile the resulting pattern."""
        # groupby merges neighbouring characters of the same type into one
        # run and compares the type names by equality (not identity).
        for btype, run in groupby(self.__toType(ch) for ch in patternStr):
            self.blockType.append(btype)
            self.block.append(sum(1 for _ in run))
        self.pattern = ''.join(
            self.typedict[btype] + str(blen) + '}'
            for btype, blen in zip(self.blockType, self.block)
        )
        self.pre = re.compile(self.pattern)

    @staticmethod
    def __toType(s: str) -> str:
        """Return the character-class name of the single character *s*."""
        if s.islower():
            return 'lower'
        if s.isupper():
            return 'upper'
        if s.isdigit():
            return 'digit'
        if s.isspace():
            return 'space'
        return 'other'

    def isPattern(self, compareStr: str) -> Optional[str]:
        """Return the matched prefix of *compareStr*, or ``None``.

        The pattern is applied with ``match``, i.e. anchored at the start
        of *compareStr* only; trailing characters are ignored.
        """
        found = self.pre.match(compareStr)
        return found.group() if found else None

    def __str__(self) -> str:
        return ' block:{0}\n blockType:{1}\n pattern :{2}'\
            .format(self.block, self.blockType, self.pattern)
# --------------------------class RePattern end----------------------------


def pickFromFile(file, strr, start=None, end=None,
                 encoding=None) -> Iterator[Tuple[str, int]]:
    """Yield every substring in *file* that has the same shape as *strr*.

    Each line is scanned with a sliding window as long as *strr*; every
    window accepted by ``RePattern(strr)`` is yielded.

    Args:
        file: Path of the text file to scan.
        strr: Sample string whose shape is searched for.
        start: Start index of the slice of each match that is yielded.
        end: End index of the slice of each match that is yielded.
        encoding: Passed to ``open``; ``None`` keeps the platform default.

    Yields:
        ``(match[start:end], count)`` tuples, where ``count`` is a running
        0-based counter.
    """
    count = 0
    # 'with' closes the file even if the consumer stops iterating early.
    with open(file, encoding=encoding) as f:
        rp = RePattern(strr)
        print(rp)
        patternLength = rp.patternLength
        for line in f:
            # Drop the line terminator explicitly so the window count does
            # not depend on it: the last line of a file may lack a newline,
            # and a match at its very end used to be skipped.
            if line.endswith('\n'):
                line = line[:-1]
            for n in range(len(line) - patternLength + 1):
                comp = rp.isPattern(line[n:n + patternLength])
                if comp:
                    yield comp[start:end], count
                    # NOTE(review): the original indentation was lost; the
                    # counter is assumed to count matches (incremented right
                    # after each yield) - confirm it was not a line counter.
                    count += 1


# --------------------run-------------------------
def main() -> None:
    """Scan ``../data.txt`` for the sample shape and print the elapsed time."""
    time1 = time.time()
    strs = pickFromFile("../data.txt", 'xxx %XX00', )
    for s in strs:
        print(s)
    # Sample matches: 'abc &BC34', 'sdf @VN03'
    time2 = time.time()
    print(time2 - time1)


if __name__ == "__main__":
    main()