from itertools import islice
import time
# Path of the code-table file (the log messages call it the "新世纪" Wubi code
# table).  It is read by read_code_table(), list_code_table() and
# single_table(), and its first 81 lines are copied unchanged to ./sorted.txt
# by the script entry point.
NCTPATH = './wubixinshiji.dict.yaml'
def read_code_table(path=None, header_lines=81):
    """Load the code table into a ``{word: code}`` dictionary.

    Every data line is expected to be tab-separated, with the word in the
    first column and its code in the second; further columns are ignored.
    When a word appears on several lines, the code of the last one wins.

    Args:
        path: File to read.  Defaults to the module-level ``NCTPATH``.
        header_lines: Number of leading lines to skip before the data
            starts (81 by default).

    Returns:
        A dict mapping each word to its code.
    """
    if path is None:
        path = NCTPATH
    table = {}
    with open(path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily; there is no need to build the full list
        # of lines just to skip the header.
        for line in islice(f, header_lines, None):
            fields = line.strip().split('\t')
            if len(fields) < 2:
                # Blank line or line without a code column: indexing
                # fields[1] would raise IndexError, so skip it.
                continue
            table[fields[0]] = fields[1]
    print("已读取新世纪码表, 共有 %d 项 " % len(table))
    return table
def list_code_table(path=None, header_lines=81):
    """Load the code table as a list of ``(word, code)`` tuples.

    Unlike a dictionary, the list keeps every row in file order, including
    rows that repeat a word.  Each data line is expected to be tab-separated
    with the word first and the code second; further columns are ignored.

    Args:
        path: File to read.  Defaults to the module-level ``NCTPATH``.
        header_lines: Number of leading lines to skip before the data
            starts (81 by default).

    Returns:
        A list of ``(word, code)`` tuples in file order.
    """
    if path is None:
        path = NCTPATH
    rows = []
    with open(path, 'r', encoding='utf-8') as f:
        # Lazy iteration: the header is skipped without reading the whole
        # file into an intermediate list first.
        for line in islice(f, header_lines, None):
            fields = line.strip().split('\t')
            if len(fields) < 2:
                # Blank or code-less line; fields[1] would raise IndexError.
                continue
            rows.append((fields[0], fields[1]))
    print("已读取新世纪码表, 共有 %d 项 " % len(rows))
    return rows
def single_table(path=None, header_lines=81):
    """Build the single-character table ``{character: code}``.

    Rows are classified as follows:

    * the first column is not exactly one character long -> counted as a
      word and dropped;
    * one character, but the code is at most one letter long -> counted
      and dropped;
    * one character that already has an entry -> counted as a duplicate
      and dropped (the first stored code is kept);
    * otherwise the row is stored.

    Args:
        path: File to read.  Defaults to the module-level ``NCTPATH``.
        header_lines: Number of leading lines to skip before the data
            starts (81 by default).

    Returns:
        A dict mapping each single character to the first code longer than
        one letter that was found for it.
    """
    if path is None:
        path = NCTPATH
    single_ct = {}
    duplicates = 0
    one_letter_codes = 0
    words = 0
    with open(path, 'r', encoding='utf-8') as f:
        for line in islice(f, header_lines, None):
            fields = line.strip().split('\t')
            if len(fields) < 2:
                # Blank or code-less line; fields[1] would raise IndexError.
                continue
            char, code = fields[0], fields[1]
            if len(char) != 1:
                words += 1
            elif len(code) <= 1:
                one_letter_codes += 1
            elif char in single_ct:
                duplicates += 1
            else:
                single_ct[char] = code
    # Report the number of lines that were really skipped; the message used
    # to hard-code 82 although only 81 lines are ignored.
    print("忽略 %d 行,删掉词语 %d 条,独码 %d 条,重复 %d 条,已生成单字码表共有 %d 项"
          % (header_lines, words, one_letter_codes, duplicates, len(single_ct)))
    return single_ct
# Module-level tables.  They are built as soon as the module is executed or
# imported, which reads the NCTPATH file three times.
# NCT: word -> code mapping (a later row for the same word overwrites an
# earlier one); filter_code_table() uses it for membership tests.
NCT = read_code_table()
# listNCT: every (word, code) row in file order, repeated words included.
listNCT=list_code_table()
# SCT: single character -> first code longer than one letter; consulted by
# query_code().
SCT = single_table()
def filter_code_table(table_path, known=None, out_path='./out.txt', header_lines=24):
    """Append the multi-character words of a table that are not yet known.

    Each data line of *table_path* is split on tabs into word and code.
    Words already present in *known* are counted as existing, words that are
    not known and are at most one character long are counted as discarded,
    and the remaining words are appended to *out_path* as ``word<TAB>code``
    lines.  A summary is printed at the end.

    Args:
        table_path: Source table to filter.
        known: Container of words that must not be extracted again.
            Defaults to the module-level ``NCT`` table.
        out_path: File the kept entries are appended to.  It is opened in
            append mode so that several calls accumulate in one file.
        header_lines: Number of leading lines of *table_path* to skip
            (24 by default).

    Returns:
        None.
    """
    if known is None:
        known = NCT
    kept = {}
    processed = 0
    existing = 0
    singles = 0
    with open(table_path, 'r', encoding='utf-8') as f:
        for line in islice(f, header_lines, None):
            fields = line.strip().split('\t')
            if len(fields) < 2:
                # Blank or code-less line: nothing to extract, and
                # fields[1] would raise IndexError for an unknown word.
                continue
            word, code = fields[0], fields[1]
            processed += 1
            if word in known:
                existing += 1
            elif len(word) > 1:
                kept[word] = code
            else:
                singles += 1
    with open(out_path, 'a', encoding='utf-8') as o:
        o.writelines('%s\t%s\n' % item for item in kept.items())
    print("处理词库文件 %s 完毕, 共处理 %d 行, %d 个项已经存在于新世纪五笔码表中, 保存了 %d 行,舍弃了 %d 个单独字符。" % (table_path, processed, existing, len(kept), singles))
def query_code(s):
    """Return the code stored in ``SCT`` for *s*, or ``None`` if it has none."""
    return SCT.get(s)
def get_code(str, lookup=None):
    """Compose the code of a word from the codes of its characters.

    The result is built from prefixes of the per-character codes:

    * 1 character: the character's own code, unchanged;
    * 2 characters: first two letters of each character's code;
    * 3 characters: first letter of the 1st and 2nd, first two letters of
      the 3rd;
    * 4 or more: first letter of the 1st, 2nd, 3rd and last character.

    Args:
        str: The word to encode.  (The name shadows the builtin and is kept
            only so existing callers keep working.)
        lookup: Callable mapping one character to its code, or to ``None``
            when the character is unknown.  Defaults to ``query_code``.

    Returns:
        The composed code, or ``None`` when the word is empty or any
        character that is needed has no code.
    """
    word = str
    if lookup is None:
        lookup = query_code
    length = len(word)
    if length == 0:
        return None
    if length == 1:
        return lookup(word)

    # (character, number of leading code letters to take) for each piece.
    if length == 2:
        picks = ((word[0], 2), (word[1], 2))
    elif length == 3:
        picks = ((word[0], 1), (word[1], 1), (word[2], 2))
    else:
        picks = ((word[0], 1), (word[1], 1), (word[2], 1), (word[-1], 1))

    pieces = []
    for char, take in picks:
        code = lookup(char)
        if not code:
            # Unknown character: slicing None would raise TypeError, so
            # report "no code" for the whole word instead.
            return None
        pieces.append(code[:take])
    return ''.join(pieces)
def take_second(e):
    """Sort-key helper: return the item at index 1 of *e*."""
    second = e[1]
    return second
def main():
    """Merge words extracted from three source tables into the main table.

    Steps:
      1. Filter the three source tables; words not yet in ``NCT`` are
         collected in ./out.txt by ``filter_code_table``.
      2. Compute a code for every collected word with ``get_code``.
      3. Combine the rows of ``listNCT`` with the new entries, sort them by
         code and write them to ./sorted.txt after the first 81 lines of
         the ``NCTPATH`` file.
    """
    header_lines = 81
    out_path = './out.txt'
    start = time.perf_counter()

    # filter_code_table() only ever appends to out.txt, so start from an
    # empty file; otherwise entries of earlier runs leak into this one.
    with open(out_path, 'w', encoding='utf-8'):
        pass
    for source in ('./wubi98_ci.dict.yaml',
                   './wubi98_S.dict.yaml',
                   './wubi98_U.dict.yaml'):
        filter_code_table(source)

    new_code_table = {}
    skipped = set()
    extracted = 0
    with open(out_path, 'r', encoding='utf-8') as f:
        for line in f:
            word = line.strip().split('\t')[0]
            extracted += 1
            code = get_code(word)
            if code is None:
                # A None code cannot be sorted together with str codes and
                # would be written out as the text "None".
                skipped.add(word)
                continue
            new_code_table[word] = code

    final_code_table = list(listNCT)
    final_code_table.extend(new_code_table.items())
    final_code_table.sort(key=take_second)

    with open(NCTPATH, 'r', encoding='utf-8') as src, \
            open('./sorted.txt', 'w', encoding='utf-8') as dst:
        # Copy the header of the original table unchanged.
        dst.writelines(islice(src, header_lines))
        dst.writelines('%s\t%s\n' % (entry[0], entry[1])
                       for entry in final_code_table)
    written = len(final_code_table)

    if skipped:
        print("有 %d 条词语因无法生成编码而被跳过。" % len(skipped))
    print("新世纪五笔原有 %d 条,新添加了 %d 条,整合后为 %d 条,从98五笔词库中提取了 %d 条词语(未查重),现写入 %d 条词语。" % (len(listNCT), len(new_code_table), len(final_code_table), extracted, written))
    end = time.perf_counter()
    print("程序用时:%fs" % (end - start))


if __name__ == '__main__':
    main()
程序输出
已读取新世纪码表, 共有 107396 项
已读取新世纪码表, 共有 112061 项
忽略 82 行,删掉词语 79459 条,独码 25 条,重复 4491 条,已生成单字码表共有 28086 项
处理词库文件 ./wubi98_ci.dict.yaml 完毕, 共处理 108547 行, 92410 个项已经存在于新世纪五笔码表中, 保存了 16047 行,舍弃了 90 个单独字符。
处理词库文件 ./wubi98_S.dict.yaml 完毕, 共处理 73011 行, 64295 个项已经存在于新世纪五笔码表中, 保存了 8715 行,舍弃了 1 个单独字符。
处理词库文件 ./wubi98_U.dict.yaml 完毕, 共处理 100477 行, 32351 个项已经存在于新世纪五笔码表中, 保存了 0 行,舍弃了 68126 个单独字符。
新世纪五笔原有 112061 条,新添加了 16072 条,整合后为 128133 条,从98五笔词库中提取了 24762 条词语(未查重),现写入 128133 条词语。
程序用时:0.481324s
|