本文记录了在创新工场实习过程中处理药物说明书的代码,全部是规则匹配和正则字符串处理,没什么技术含量,仅做个记录,方便以后搬运。
# 处理爬取的药物说明书数据集时使用的代码,药物说明书链接:https://pan.baidu.com/s/1v1puWYLY8lDnbKKEPVTFgg 密码:jjw7
import re
import pandas as pd
# Open the raw scraped dump. Its only consumer in this file is the
# commented-out parsing loop below, which iterates over f.readlines().
# NOTE(review): the handle is never closed, no encoding is given, and the
# name `f` is rebound to a DataFrame further down in this file.
f = open('./药品说明书(除西药)/健客.txt','r')
# Collects one dict per parsed line; the commented-out code appends to it
# and finally builds a DataFrame from it.
data = []
# Not referenced anywhere else in the visible source.
num = 0
# print(clounms)
#for line in f.readlines():
#line = re.sub('\[|\]|\'|"',string=line,repl='')
#list = line.split('\t')
#line_data = {}
#jingji = ''
# yaopingdizhi = list[0]
# line_data['药品地址'] = yaopingdizhi
# print(yaopingdizhi)
# yaopingming = list[1].split(' ')[0].replace('价格','')# 药品名称在后面一并处理
# line_data['药品名_药盒'] = yaopingming
# # print(line_data['药品名_药盒'])
#
# list2 = list[2].split(',')
# if len(list2) != 2:# 非药品有两个标签,药品有三个标签,非药品没有是否处方药
# chufangyao = ''
# yaopingleibie = ''
# yibao = ''
# else:
# chufangyao = ''
# yaopingleibie = list2[0]
# yibao = list2[1]
# line_data['是否处方药'] = chufangyao
# line_data['药品类别'] =yaopingleibie
# line_data['是否医保'] = yibao
#
#
#
# # print(list[3])
# jiage = float(re.sub('¥|,',string=list[3],repl=''))
# line_data['价格'] = jiage
# # print(jiage)
#
#
# pizhunwenhao = re.sub('批准文号:',string = list[4],repl='').replace(' ','').replace('国药准字','').replace('国药准','').replace('注册证号','').replace('医疗机构制剂','').replace('国准字','').replace('该药已做变更','')
# pizhunwenhao = re.sub('国?药?准?字?',string = pizhunwenhao,repl='').replace('食健','').replace('+','/').replace('京制','')
# # if not re.match(pattern='.*[A-Z]{1,}[0-9]{6,}.*',string=pizhunwenhao):
# # pizhunwenhao=''
# line_data['批准文号'] = pizhunwenhao
# # print('批准文号',pizhunwenhao)
# #
# yaopingshuomingcanshu = list[5].strip().split(',')
#
# for i in yaopingshuomingcanshu:
# if '药品名称:' in i :
# yaopingming_shuomingshu = re.sub(pattern='汉语拼音.*',string=i,repl='').replace(':','').replace(':','')
# yaopingming_shuomingshu = re.sub(pattern='英文名.*',string=i,repl='')
# yaopingming_shuomingshu = re.sub(pattern='曾用名.*',string=i,repl='')
# yaopingming_shuomingshu = re.sub(pattern='药品名称:?:?',string=i,repl='')
# if yaopingming_shuomingshu == yaopingming:
# yaopingming_shuomingshu = ''
# line_data['药品名_说明书'] = yaopingming_shuomingshu
# # print(yaopingming_shuomingshu)
# #
# #
# if '药品规格:' in i :
# yaopingguige = i.replace('药品规格:', '')
# line_data['药品规格'] = yaopingguige
# #print(yaopingguige)
#
# if '药品单位' in i :
# yaopingdanwei = re.sub('生产厂家:.*',string=i,repl='').replace('药品单位:','').replace('/盒','')
# yaopingdanwei = re.sub('大|中|小',string=yaopingdanwei,repl='')
# line_data['药品单位'] = yaopingdanwei
# #print(yaopingdanwei)
#
# if '生产厂家' in i:
# shenchanchangjia = re.sub('.*生产厂家[:是]',string=i,repl='')
# shenchanchangjia = re.sub(',.*',string=shenchanchangjia,repl='')
# # print(shenchanchangjia)
# line_data['生产厂家'] = shenchanchangjia
# # print(shenchanchangjia)
#
#
# if '主治疾病' in i :
# zhuzhijibing = i.replace('主治疾病:','')
# # print(i)
# line_data['主治疾病'] = zhuzhijibing
# # print(zhuzhijibing)
#
#
# if '医师建议' and '注册证号' in i :
# # print('医师建议与注册证号',i)
# try:
# xiugaizuhcezhenghao = i.split('原注册证号')[0]
# except:
# pass
# xiugaizuhcezhenghao = re.sub(pattern='医师建议:注册证号',string=i,repl='')
# xiugaizuhcezhenghao = re.sub('国药准字',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub('批准文号:?',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub(pattern='[注]?册证号[:]?',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub(pattern='注*册证号',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub('原注册证号.*',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub('分包装.*',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub('公司名称.*',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub('原.*',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub('产品.*',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub(':|:|备注|医师建议',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub('医师建议注?册?证?号?',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub('&.*',string=xiugaizuhcezhenghao,repl='')
# if re.match(pattern='[A-Z]+[0-9]*',string=xiugaizuhcezhenghao):
# if xiugaizuhcezhenghao == pizhunwenhao:
# # print('批准证号未修改',pizhunwenhao,xiugaizuhcezhenghao)
# xiugaizuhcezhenghao = ''
# line_data['修改之后的批准文号'] = xiugaizuhcezhenghao
# else:
# line_data['修改之后的批准文号'] = ''
# # print(line_data['修改之后的批准文号'])
# elif '医师建议' in i :
# yishijianyi = re.sub(pattern='.*医师建议:?',string=i,repl='')
# # print(i)
# line_data['医师建议'] = yishijianyi
# #print(line_data['医师建议'])
#
# #
# yaopingxiangxishuomingshu = list[6].strip().split(',')
# for j in yaopingxiangxishuomingshu:
# if '药品名称' in j :
# yaopingming_xiangxishuomingshu = re.sub(pattern='.*药品名称?:?:?',string=j,repl='').replace(':','').replace(':','')
# line_data['药品名_详细说明书']= yaopingming_xiangxishuomingshu
# # print(line_data['药品名_详细说明书'])
# #
# if '商品名/商标' in j:
# shangbiao = re.sub(pattern='商品名/商标:',string=j,repl='')
# line_data['商标'] = shangbiao
# # print(shangbiao)
# #
#
# if '规格' in j :
# guige_xiangxishuomingshu = re.sub(pattern='规格:',string=j,repl='')
# if len(guige_xiangxishuomingshu) >50:
# #print(guige_xiangxishuomingshu)
# guige_xiangxishuomingshu = ''
# if guige_xiangxishuomingshu!= yaopingguige:
# line_data['详细说明书中的药品规格'] = guige_xiangxishuomingshu
# #print('前后规格不一致',yaopingguige,guige_xiangxishuomingshu)
# #
# #
# if '适应症' in j :
# shiyingzheng = re.sub(pattern='适应症:?:?',string=j,repl='')
# line_data['适应症'] = shiyingzheng
# #print(shiyingzheng)
# if '用法用量' in j :
# yongfayongliang = re.sub(pattern='用法用量:?:?',string=j,repl='')
# line_data['用法用量'] = yongfayongliang
# #print(yongfayongliang)
# #
# if '不良反应:' in j :
# buliangfanying = re.sub(pattern='不良反应:?:?',string=j,repl='')
# line_data['不良反应'] = buliangfanying
# #print(buliangfanying)
# # # 药品详细说明书(药品名,商标,规格,适应症,用法用量,不良反应,禁忌,注意事项,药理毒理,生产厂家,批准文号,生产地址,条形码)']
# if '禁忌' in j:
# jingji = re.sub(pattern='禁忌:?:?',string=j,repl='')
# line_data['禁忌'] = jingji
# #print(jingji)
# if '注意事项' in j :
# zhuyishixiang = re.sub(pattern='注意事项:?:?',string=j,repl='')
# line_data['注意事项'] = zhuyishixiang
# if jingji == zhuyishixiang:
# line_data['注意事项'] = ''
# # print(zhuyishixiang)
# line_data['药理毒理'] = ''
# if '药理毒理' in j :
# bingliduli = re.sub(pattern='药理毒理:',string=j,repl='')
# line_data['药理毒理'] = bingliduli
# if '生产厂家' in j :
# shengchanchagnjia = re.sub(pattern='生产厂家:?',string=j,repl='')
# line_data['生产厂家'] = shengchanchagnjia
#
# if '生产地址' in j :
# shengchandizhi = re.sub(pattern='生产地址:',string=j,repl='')
# line_data['生产地址'] = shengchandizhi
# if '条形码' in j :
# tiaoxingma = re.sub(pattern='条形码:?',string=j,repl='')
# if not re.match(pattern='.*[0-9]{7,}',string=tiaoxingma):
# # print('错误',tiaoxingma,j)
# tiaoxingma = ''
# line_data['条形码'] = tiaoxingma
# #
# #
# #
# #
# if yaopingming_xiangxishuomingshu == yaopingming_shuomingshu and yaopingming_xiangxishuomingshu == yaopingming and yaopingming == yaopingming_shuomingshu:
# # print('前中后一致')
# pass
# elif yaopingming_xiangxishuomingshu == yaopingming:
# # print('前后相等')
# line_data['药品名_详细说明书'] = ''
# elif yaopingming_xiangxishuomingshu == yaopingming_shuomingshu:
# # print('中后相等')
# line_data['药品名_详细说明书'] = ''
# else:
# pass
# # print('都不相等',yaopingming,yaopingming_shuomingshu,yaopingming_xiangxishuomingshu)
#
# # print(yaopingming)
# # print(yaopingming_shuomingshu)
# # print(yaopingming_xiangxishuomingshu,'\n\n\n')
# flag = 0
# fenges = ['商品名', '商品名称', '通用名', '通用名称', '汉语拼音', '英文名称', '成份', '英文名', '曾用名','剂型','本品主要成分','主要成分','成分','本品']
# for fenge in fenges:
# if fenge in yaopingming_shuomingshu:
# flag = 1
# break
# if flag == 1:
# yaopingming_shuomingshu = re.sub(pattern='医师建议:?:?',string=yaopingming_shuomingshu,repl='')
# if '通用名称' or '通用名' in yaopingming_shuomingshu:
# # print(yaopingming_shuomingshu)
# tongyongming = re.sub(pattern='.*通用名称?:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','').replace('品名','')
# # print(tongyongming)
# # print(tongyongming)
# # tongyongming1=''
# for t in fenges:
# if t in tongyongming:
# # print(t)
# tongyongming = tongyongming.split(t)[0]
# # print(tongyongming)
# # break
# # print(tongyongming)
# line_data['通用名'] = tongyongming
# # print(line_data['通用名'])
#
# if '商品名称' or '商品名' in yaopingming_shuomingshu:
# shangpinmingcheng = re.sub(pattern='.*商品名称?:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','').replace('品名','')
# # print(tongyongming)
# # tongyongming1=''
# for t in fenges:
# if t in shangpinmingcheng:
# # print(t)
# shangpinmingcheng = shangpinmingcheng.split(t)[0]
# # break
# line_data['商品名称'] = shangpinmingcheng
#
# if '汉语拼音' in yaopingming_shuomingshu:
# hanyupinyin = re.sub(pattern='.*汉语拼音:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','')
# # print(tongyongming)
# # tongyongming1=''
# for t in fenges:
# if t in hanyupinyin:
# # print(t)
# hanyupinyin = hanyupinyin.split(t)[0]
# # break
# line_data['汉语拼音'] = hanyupinyin
# line_data['英文名称'] = ''
# if '英文名称' or '英文名' in yaopingming_shuomingshu:
# # print(yaopingming_shuomingshu)
# yingwenmincheng = re.sub(pattern='.*英文名称?:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','')
# # print(tongyongming)
# # tongyongming1=''
# for t in fenges:
# if t in yingwenmincheng:
# # print(t)
# yingwenmincheng = yingwenmincheng.split(t)[0]
# # break
# if re.match(pattern='[a-zA-Z]*',string=yingwenmincheng):
# # print(yingwenmincheng)
# line_data['英文名称'] = yingwenmincheng
# else:
# line_data['英文名称'] = ''
#
# line_data['曾用名'] = ''
# if '曾用名' in yaopingming_shuomingshu:
# cengyongming = re.sub(pattern='.*曾用名次?:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','')
# # print(tongyongming)
# # tongyongming1=''
# for t in fenges:
# if t in cengyongming:
# # print(t)
# cengyongming = cengyongming.split(t)[0]
# # break
# line_data['曾用名'] = cengyongming
#
# line_data['剂型'] = ''
# if '剂型' in yaopingming_shuomingshu:
# jixing = re.sub(pattern='.*剂型:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','')
# # print(tongyongming)
# # tongyongming1=''
# for t in fenges:
# if t in jixing:
# # print(t)
# jixing = jixing.split(t)[0]
# # print(jixing)
# # break
# line_data['剂型'] = jixing
# # print(jixing)
#
# data.append(line_data)
# df = pd.DataFrame(data)
# df = df.fillna('')
# jishu =0
# clounms = ['药品名_药盒','药品名_说明书','药品名_详细说明书','通用名','曾用名','商品名称','英文名称','汉语拼音','剂型','是否处方药','药品类别','是否医保','价格','批准文号','修改之后的批准文号',
# '药品规格','详细说明书中的药品规格','药品单位','用法用量','适应症','主治疾病','不良反应','禁忌','注意事项','药理毒理','医师建议','生产地址','生产厂家','药品地址','条形码','商标']
# df = df[clounms]
# df.to_csv('健客.csv',index=False)
# 将得到的药品说明书进行去重,得到所有的药品样品数量
import pandas as pd
import _thread
# Load the drug table. The path and encoding are handed to pandas directly so
# that pandas opens and closes the file itself (the previous
# `pd.read_csv(open(...))` left the file handle open).
ff = pd.read_csv('中成药.csv', encoding='utf-8')
# Row indices flagged as duplicates by f1..f4 below.
# NOTE: this name shadows the builtin `list`; it is kept because the rest of
# the script refers to it by this name.
list = []
# Columns that may carry a name for the drug; two rows sharing a value in any
# of these columns are treated as the same drug.
A = ['药品名_药盒', '药品名_详细说明书', '通用名', '曾用名', '商品名称', '英文名称', '汉语拼音']
# Values that the first two rows have in common across the name columns.
# f1..f4 ignore an overlap that is exactly this set.
# NOTE(review): presumably this captures the "empty cell" placeholder; that
# only holds if rows 0 and 1 share nothing but empty values - confirm.
kong = set(ff[A].loc[0]) & set(ff[A].loc[1])
# Exact duplicates on the box name are removed first; the index is reset so
# that labels 0..n-1 coincide with row positions.
f = ff.drop_duplicates(['药品名_药盒']).reset_index(drop=True)
# Number of rows left to scan.
changdu = int(f.shape[0])
def _flag_shared_names(start, stop):
    """Flag rows that share a name with a not-yet-flagged row in [start, stop).

    Reads the module-level frame ``f``, the name columns ``A`` and the
    placeholder set ``kong``. For every reference row ``i`` in the range that
    has not been flagged yet, each *other* row ``j`` whose name values overlap
    with row ``i`` (by more than nothing, and not by exactly ``kong``) has its
    index appended to the module-level ``list``.

    Args:
        start: Index of the first reference row (inclusive).
        stop: Index at which to stop (exclusive).
    """
    names = f[A]
    # Build each row's value set once instead of re-slicing the frame for
    # every (i, j) pair.
    row_sets = [set(names.loc[k]) for k in range(f.shape[0])]
    for i in range(start, stop):
        if i in list:
            # Already flagged as a copy of an earlier row.
            continue
        current = row_sets[i]
        for j, other in enumerate(row_sets):
            if j == i:
                # A row always overlaps with itself; comparing it would flag
                # every single row as its own duplicate.
                continue
            chongfu = current & other
            if chongfu != kong and chongfu != set():
                print(current, other, chongfu)
                list.append(j)


def f1():
    """Scan the first quarter of the rows for duplicates."""
    _flag_shared_names(0, changdu // 4)


def f2():
    """Scan the second quarter of the rows for duplicates."""
    _flag_shared_names(changdu // 4, changdu // 4 * 2)


def f3():
    """Scan the third quarter of the rows for duplicates."""
    _flag_shared_names(changdu // 4 * 2, changdu // 4 * 3)


def f4():
    """Scan the last quarter of the rows (including any remainder)."""
    _flag_shared_names(changdu // 4 * 3, changdu)
# Run the four quarter scans strictly one after another.
# The previous `_thread.start_new_thread(f1())` called f1 synchronously and
# then handed its return value (None) to start_new_thread, which raised a
# TypeError that the bare `except` hid, so f2..f4 never ran. Real threads
# would not be correct here either: each scan consults the shared `list` to
# skip rows an earlier scan already flagged, so running them concurrently
# could flag both members of a duplicate pair.
for scan in (f1, f2, f3, f4):
    scan()
# DataFrame.drop returns a new frame; keep the result so the flagged rows are
# actually removed and the row count in the file name reflects the deletion.
# NOTE(review): this trusts `list` to hold only the redundant rows; confirm
# the scan functions never flag a row against itself before relying on the
# output.
f = f.drop(list)
f.to_csv('删除之后的中成药_{}.csv'.format(f.shape[0]),encoding='utf-8')
print(list)