创新工场—处理药物数据中使用过的代码

本文记录了在创新工场实习期间处理药物说明书数据时用到的代码。这些代码基本都是规则匹配和正则表达式处理,没什么技术含量,仅做个记录,方便日后搬运。

# Code used to process the scraped drug package-insert dataset. Dataset link: https://pan.baidu.com/s/1v1puWYLY8lDnbKKEPVTFgg  password: jjw7
import re
import pandas as pd
# Handle on the raw 健客.txt scrape. The only loop that reads it is commented
# out below, and nothing in the visible code closes it; the name `f` is later
# rebound to a DataFrame by the de-duplication script.
# NOTE(review): no encoding is given, so the platform default is used — confirm
# the file's actual encoding before re-enabling the parsing loop.
f = open('./药品说明书(除西药)/健客.txt','r')
# Accumulates one dict per parsed record (filled by the commented-out loop).
data = []

# Initialised here but not referenced anywhere else in the visible code.
num = 0
# print(clounms)
#for line in f.readlines():
    #line = re.sub('\[|\]|\'|"',string=line,repl='')
    #list = line.split('\t')
    #line_data = {}
    #jingji = ''
    
    
    # yaopingdizhi  = list[0]
    # line_data['药品地址'] = yaopingdizhi
    # print(yaopingdizhi)
    
    # yaopingming = list[1].split(' ')[0].replace('价格','')# 药品名称在后面一并处理
    # line_data['药品名_药盒'] = yaopingming
    # # print(line_data['药品名_药盒'])
    #
#     list2 = list[2].split(',')
#     if len(list2) != 2:# 非药品有两个标签,药品有三个标签,非药品没有是否处方药
#         chufangyao = ''
#         yaopingleibie = ''
#         yibao = ''
#     else:
#         chufangyao = ''
#         yaopingleibie = list2[0]
#         yibao = list2[1]
#     line_data['是否处方药'] = chufangyao
#     line_data['药品类别'] =yaopingleibie
#     line_data['是否医保'] = yibao
#
#
#
#     # print(list[3])
#     jiage = float(re.sub('¥|,',string=list[3],repl=''))
#     line_data['价格'] = jiage
#     # print(jiage)
#
#
#     pizhunwenhao = re.sub('批准文号:',string = list[4],repl='').replace(' ','').replace('国药准字','').replace('国药准','').replace('注册证号','').replace('医疗机构制剂','').replace('国准字','').replace('该药已做变更','')
#     pizhunwenhao = re.sub('国?药?准?字?',string = pizhunwenhao,repl='').replace('食健','').replace('+','/').replace('京制','')
#     # if not re.match(pattern='.*[A-Z]{1,}[0-9]{6,}.*',string=pizhunwenhao):
#     #     pizhunwenhao=''
#     line_data['批准文号'] = pizhunwenhao
#     # print('批准文号',pizhunwenhao)
# #
#     yaopingshuomingcanshu  = list[5].strip().split(',')
#
#     for i in yaopingshuomingcanshu:
#         if '药品名称:' in i :
#             yaopingming_shuomingshu = re.sub(pattern='汉语拼音.*',string=i,repl='').replace(':','').replace(':','')
#             yaopingming_shuomingshu = re.sub(pattern='英文名.*',string=i,repl='')
#             yaopingming_shuomingshu = re.sub(pattern='曾用名.*',string=i,repl='')
#             yaopingming_shuomingshu = re.sub(pattern='药品名称:?:?',string=i,repl='')
#             if yaopingming_shuomingshu == yaopingming:
#                 yaopingming_shuomingshu = ''
#             line_data['药品名_说明书'] = yaopingming_shuomingshu
#             # print(yaopingming_shuomingshu)
# #
# #
#         if '药品规格:' in i :
#             yaopingguige = i.replace('药品规格:', '')
#             line_data['药品规格'] = yaopingguige
#             #print(yaopingguige)
#
#         if '药品单位' in i :
#             yaopingdanwei = re.sub('生产厂家:.*',string=i,repl='').replace('药品单位:','').replace('/盒','')
#             yaopingdanwei = re.sub('大|中|小',string=yaopingdanwei,repl='')
#             line_data['药品单位'] = yaopingdanwei
#             #print(yaopingdanwei)
#
#         if '生产厂家' in i:
#             shenchanchangjia = re.sub('.*生产厂家[:是]',string=i,repl='')
#             shenchanchangjia = re.sub(',.*',string=shenchanchangjia,repl='')
#             # print(shenchanchangjia)
#             line_data['生产厂家'] = shenchanchangjia
#             # print(shenchanchangjia)
#
#
#         if '主治疾病' in i :
#             zhuzhijibing = i.replace('主治疾病:','')
#             # print(i)
#             line_data['主治疾病'] = zhuzhijibing
#             # print(zhuzhijibing)
#
#
#         if '医师建议' and '注册证号'  in i :
#             # print('医师建议与注册证号',i)
#             try:
#                 xiugaizuhcezhenghao = i.split('原注册证号')[0]
#             except:
#                 pass
#             xiugaizuhcezhenghao = re.sub(pattern='医师建议:注册证号',string=i,repl='')
#             xiugaizuhcezhenghao = re.sub('国药准字',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub('批准文号:?',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub(pattern='[注]?册证号[:]?',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub(pattern='注*册证号',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub('原注册证号.*',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub('分包装.*',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub('公司名称.*',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub('原.*',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub('产品.*',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub(':|:|备注|医师建议',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub('医师建议注?册?证?号?',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub('&.*',string=xiugaizuhcezhenghao,repl='')
#             if re.match(pattern='[A-Z]+[0-9]*',string=xiugaizuhcezhenghao):
#                 if xiugaizuhcezhenghao == pizhunwenhao:
#                     # print('批准证号未修改',pizhunwenhao,xiugaizuhcezhenghao)
#                     xiugaizuhcezhenghao = ''
#                 line_data['修改之后的批准文号'] = xiugaizuhcezhenghao
#             else:
#                 line_data['修改之后的批准文号'] = ''
#             # print(line_data['修改之后的批准文号'])
#         elif '医师建议'  in i :
#             yishijianyi = re.sub(pattern='.*医师建议:?',string=i,repl='')
#             # print(i)
#             line_data['医师建议'] = yishijianyi
#             #print(line_data['医师建议'])
#
# #
#     yaopingxiangxishuomingshu = list[6].strip().split(',')
#     for j in yaopingxiangxishuomingshu:
#         if '药品名称' in j :
#             yaopingming_xiangxishuomingshu = re.sub(pattern='.*药品名称?:?:?',string=j,repl='').replace(':','').replace(':','')
#             line_data['药品名_详细说明书']= yaopingming_xiangxishuomingshu
#             # print(line_data['药品名_详细说明书'])
# #
#         if '商品名/商标'  in j:
#             shangbiao = re.sub(pattern='商品名/商标:',string=j,repl='')
#             line_data['商标'] = shangbiao
#             # print(shangbiao)
# #
#
#         if '规格' in j :
#             guige_xiangxishuomingshu  = re.sub(pattern='规格:',string=j,repl='')
#             if len(guige_xiangxishuomingshu) >50:
#                 #print(guige_xiangxishuomingshu)
#                 guige_xiangxishuomingshu = ''
#             if guige_xiangxishuomingshu!= yaopingguige:
#                 line_data['详细说明书中的药品规格'] = guige_xiangxishuomingshu
#                 #print('前后规格不一致',yaopingguige,guige_xiangxishuomingshu)
# #
# #
#         if '适应症' in j :
#             shiyingzheng  = re.sub(pattern='适应症:?:?',string=j,repl='')
#             line_data['适应症'] = shiyingzheng
#             #print(shiyingzheng)
#         if '用法用量' in j :
#             yongfayongliang = re.sub(pattern='用法用量:?:?',string=j,repl='')
#             line_data['用法用量'] = yongfayongliang
#             #print(yongfayongliang)
# #
#         if '不良反应:' in j :
#             buliangfanying = re.sub(pattern='不良反应:?:?',string=j,repl='')
#             line_data['不良反应'] = buliangfanying
#             #print(buliangfanying)
# #     # 药品详细说明书(药品名,商标,规格,适应症,用法用量,不良反应,禁忌,注意事项,药理毒理,生产厂家,批准文号,生产地址,条形码)']
#         if '禁忌' in  j:
#             jingji = re.sub(pattern='禁忌:?:?',string=j,repl='')
#             line_data['禁忌'] = jingji
#             #print(jingji)
#         if '注意事项' in j :
#             zhuyishixiang  = re.sub(pattern='注意事项:?:?',string=j,repl='')
#             line_data['注意事项'] = zhuyishixiang
#             if jingji == zhuyishixiang:
#                 line_data['注意事项'] = ''
#             # print(zhuyishixiang)
#         line_data['药理毒理'] = ''
#         if '药理毒理'  in j :
#             bingliduli = re.sub(pattern='药理毒理:',string=j,repl='')
#             line_data['药理毒理'] = bingliduli
#         if '生产厂家' in j :
#             shengchanchagnjia = re.sub(pattern='生产厂家:?',string=j,repl='')
#             line_data['生产厂家'] = shengchanchagnjia
#
#         if '生产地址' in j :
#             shengchandizhi = re.sub(pattern='生产地址:',string=j,repl='')
#             line_data['生产地址'] = shengchandizhi
#         if '条形码' in j :
#             tiaoxingma = re.sub(pattern='条形码:?',string=j,repl='')
#             if not re.match(pattern='.*[0-9]{7,}',string=tiaoxingma):
#                 # print('错误',tiaoxingma,j)
#                 tiaoxingma = ''
#             line_data['条形码'] = tiaoxingma
# #
# #
# #
# #
#     if yaopingming_xiangxishuomingshu == yaopingming_shuomingshu and yaopingming_xiangxishuomingshu == yaopingming and yaopingming == yaopingming_shuomingshu:
#         # print('前中后一致')
#         pass
#     elif yaopingming_xiangxishuomingshu == yaopingming:
#         # print('前后相等')
#         line_data['药品名_详细说明书'] = ''
#     elif yaopingming_xiangxishuomingshu == yaopingming_shuomingshu:
#         # print('中后相等')
#         line_data['药品名_详细说明书'] = ''
#     else:
#         pass
#         # print('都不相等',yaopingming,yaopingming_shuomingshu,yaopingming_xiangxishuomingshu)
#
#     # print(yaopingming)
#     # print(yaopingming_shuomingshu)
#     # print(yaopingming_xiangxishuomingshu,'\n\n\n')
#     flag = 0
#     fenges = ['商品名', '商品名称', '通用名', '通用名称', '汉语拼音', '英文名称', '成份', '英文名', '曾用名','剂型','本品主要成分','主要成分','成分','本品']
#     for fenge in fenges:
#         if fenge in yaopingming_shuomingshu:
#             flag = 1
#             break
#     if flag == 1:
#         yaopingming_shuomingshu = re.sub(pattern='医师建议:?:?',string=yaopingming_shuomingshu,repl='')
#         if '通用名称' or '通用名' in yaopingming_shuomingshu:
#             # print(yaopingming_shuomingshu)
#             tongyongming = re.sub(pattern='.*通用名称?:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','').replace('品名','')
#             # print(tongyongming)
#             # print(tongyongming)
#             # tongyongming1=''
#             for t in fenges:
#                 if t in tongyongming:
#                     # print(t)
#                     tongyongming = tongyongming.split(t)[0]
#                     # print(tongyongming)
#                     # break
#             # print(tongyongming)
#             line_data['通用名'] = tongyongming
#             # print(line_data['通用名'])
#
#         if '商品名称' or '商品名' in yaopingming_shuomingshu:
#             shangpinmingcheng = re.sub(pattern='.*商品名称?:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','').replace('品名','')
#             # print(tongyongming)
#             # tongyongming1=''
#             for t in fenges:
#                 if t in shangpinmingcheng:
#                     # print(t)
#                     shangpinmingcheng = shangpinmingcheng.split(t)[0]
#                     # break
#             line_data['商品名称'] = shangpinmingcheng
#
#         if '汉语拼音' in yaopingming_shuomingshu:
#             hanyupinyin = re.sub(pattern='.*汉语拼音:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','')
#             # print(tongyongming)
#             # tongyongming1=''
#             for t in fenges:
#                 if t in hanyupinyin:
#                     # print(t)
#                     hanyupinyin = hanyupinyin.split(t)[0]
#                     # break
#             line_data['汉语拼音'] = hanyupinyin
#         line_data['英文名称'] = ''
#         if '英文名称' or '英文名' in yaopingming_shuomingshu:
#             # print(yaopingming_shuomingshu)
#             yingwenmincheng = re.sub(pattern='.*英文名称?:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','')
#             # print(tongyongming)
#             # tongyongming1=''
#             for t in fenges:
#                 if t in yingwenmincheng:
#                     # print(t)
#                     yingwenmincheng = yingwenmincheng.split(t)[0]
#                     # break
#             if re.match(pattern='[a-zA-Z]*',string=yingwenmincheng):
#                 # print(yingwenmincheng)
#                 line_data['英文名称'] = yingwenmincheng
#             else:
#                 line_data['英文名称'] = ''
#
#         line_data['曾用名'] = ''
#         if '曾用名' in yaopingming_shuomingshu:
#             cengyongming = re.sub(pattern='.*曾用名次?:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','')
#             # print(tongyongming)
#             # tongyongming1=''
#             for t in fenges:
#                 if t in cengyongming:
#                     # print(t)
#                     cengyongming = cengyongming.split(t)[0]
#                     # break
#             line_data['曾用名'] = cengyongming
#
#         line_data['剂型'] = ''
#         if '剂型' in yaopingming_shuomingshu:
#             jixing = re.sub(pattern='.*剂型:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','')
#             # print(tongyongming)
#             # tongyongming1=''
#             for t in fenges:
#                 if t in jixing:
#                     # print(t)
#                     jixing = jixing.split(t)[0]
#                     # print(jixing)
#                     # break
#             line_data['剂型'] = jixing
#             # print(jixing)
#
#     data.append(line_data)
# df = pd.DataFrame(data)
# df = df.fillna('')
# jishu =0
# clounms = ['药品名_药盒','药品名_说明书','药品名_详细说明书','通用名','曾用名','商品名称','英文名称','汉语拼音','剂型','是否处方药','药品类别','是否医保','价格','批准文号','修改之后的批准文号',
#            '药品规格','详细说明书中的药品规格','药品单位','用法用量','适应症','主治疾病','不良反应','禁忌','注意事项','药理毒理','医师建议','生产地址','生产厂家','药品地址','条形码','商标']
# df = df[clounms]
# df.to_csv('健客.csv',index=False)
# 将得到的药品说明书进行去重,得到所有的药品样品数量
import pandas as pd
import _thread
# Load the records to de-duplicate. The path and encoding are handed straight
# to pandas so that it opens *and closes* the file itself; the original passed
# an open() handle that was never closed.
ff = pd.read_csv('中成药.csv', encoding='utf-8')
# Row labels of `f` flagged as duplicates (filled in by f1..f4).
# NOTE: this shadows the builtin `list`; the name is kept because the rest of
# the script refers to it.
list = []
# Name-like columns compared when deciding whether two rows overlap.
A = ['药品名_药盒', '药品名_详细说明书', '通用名', '曾用名', '商品名称', '英文名称', '汉语拼音']
# Values shared by the first two rows across the name columns. f1..f4 treat an
# overlap equal to this set as "no match".
# NOTE(review): presumably this captures the empty-cell placeholder (e.g. NaN);
# that only holds if rows 0 and 1 share no real name — confirm against the data.
kong = set(ff[A].loc[0]) & set(ff[A].loc[1])
# Drop exact repeats of the box name and renumber rows 0..n-1 so that the
# positions used by f1..f4 are also valid row labels.
f = ff.drop_duplicates(['药品名_药盒']).reset_index(drop=True)
# Number of rows left to scan; f1..f4 split this range into quarters.
changdu = int(f.shape[0])
def _mark_duplicates(start, stop):
    """Flag later rows of ``f`` that repeat a name of a row in ``[start, stop)``.

    Each not-yet-flagged row ``i`` in the range is compared, over the columns
    in ``A``, with every row after it. When the shared values are neither
    empty nor equal to ``kong``, the later row's label is appended to the
    module-level ``list`` of rows to drop.

    Only rows *after* ``i`` are inspected. The original loops started at row
    0, so every row matched itself and was queued for deletion. Looking
    forward only also means a row is flagged solely because of an earlier
    row, so the first row of each duplicate group always survives, whatever
    order the workers run in.
    """
    names = f[A]  # loop-invariant column selection, hoisted out of the loops
    total = f.shape[0]
    for i in range(start, stop):
        if i in list:
            continue  # already flagged as a duplicate of an earlier row
        names_i = set(names.loc[i])
        for j in range(i + 1, total):
            names_j = set(names.loc[j])
            chongfu = names_i & names_j
            if chongfu != kong and chongfu != set():
                print(names_i, names_j, chongfu)
                list.append(j)


def f1():
    """Scan the first quarter of the rows for duplicates."""
    _mark_duplicates(0, changdu // 4)


def f2():
    """Scan the second quarter of the rows for duplicates."""
    _mark_duplicates(changdu // 4, changdu // 4 * 2)


def f3():
    """Scan the third quarter of the rows for duplicates."""
    _mark_duplicates(changdu // 4 * 2, changdu // 4 * 3)


def f4():
    """Scan the remaining rows (last quarter plus any remainder)."""
    _mark_duplicates(changdu // 4 * 3, changdu)


# Local import: this script section is the only user of the executor.
from concurrent.futures import ThreadPoolExecutor

# Run the four scans on worker threads. The original wrote
# `_thread.start_new_thread(f1())`, which *called* f1 on the main thread and
# then passed its None result to start_new_thread; the resulting TypeError was
# swallowed by a bare `except`, so f2..f4 never ran.
# Leaving the `with` block waits for every worker to finish, so the drop list
# is complete before it is used.
with ThreadPoolExecutor(max_workers=4) as pool:
    futures = [pool.submit(worker) for worker in (f1, f2, f3, f4)]

try:
    for future in futures:
        future.result()  # re-raises any exception from the worker
except Exception as exc:
    # Keep the original marker message, but do not go on to write a CSV built
    # from an incomplete duplicate scan.
    print('错误', exc)
    raise

# DataFrame.drop returns a new frame; the original discarded that result, so
# the CSV was written with nothing removed. Labels are de-duplicated first
# because a row can be flagged more than once.
f = f.drop(sorted(set(list)))
f.to_csv('删除之后的中成药_{}.csv'.format(f.shape[0]), encoding='utf-8')
print(list)
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章