創新工場—處理藥物數據中使用過的代碼

本文記錄了在創新工場實習期間處理藥物說明書時用過的代碼。全部都是規則匹配和正則表達式的字符串處理,沒什麼技術含量,僅做個記錄,方便日後搬運。

# 處理爬取的藥物說明書數據集時使用的代碼,藥物說明書鏈接:https://pan.baidu.com/s/1v1puWYLY8lDnbKKEPVTFgg  密碼:jjw7
import re
import pandas as pd
# Raw scraped leaflet dump. In the visible code only the commented-out parsing
# loop below reads from this handle.
# NOTE(review): the handle is never closed here and no encoding is passed, so
# the platform default encoding is used -- confirm the file's actual encoding.
f = open('./藥品說明書(除西藥)/健客.txt','r')
# Collects one dict per parsed line (appended to by the commented-out loop).
data = []

# Not referenced by any active code in view.
num = 0
# print(clounms)
#for line in f.readlines():
    #line = re.sub('\[|\]|\'|"',string=line,repl='')
    #list = line.split('\t')
    #line_data = {}
    #jingji = ''
    
    
    # yaopingdizhi  = list[0]
    # line_data['藥品地址'] = yaopingdizhi
    # print(yaopingdizhi)
    
    # yaopingming = list[1].split(' ')[0].replace('價格','')# 藥品名稱在後面一併處理
    # line_data['藥品名_藥盒'] = yaopingming
    # # print(line_data['藥品名_藥盒'])
    #
#     list2 = list[2].split(',')
#     if len(list2) != 2:# 非藥品有兩個標籤,藥品有三個標籤,非藥品沒有是否處方藥
#         chufangyao = ''
#         yaopingleibie = ''
#         yibao = ''
#     else:
#         chufangyao = ''
#         yaopingleibie = list2[0]
#         yibao = list2[1]
#     line_data['是否處方藥'] = chufangyao
#     line_data['藥品類別'] =yaopingleibie
#     line_data['是否醫保'] = yibao
#
#
#
#     # print(list[3])
#     jiage = float(re.sub('¥|,',string=list[3],repl=''))
#     line_data['價格'] = jiage
#     # print(jiage)
#
#
#     pizhunwenhao = re.sub('批准文號:',string = list[4],repl='').replace(' ','').replace('國藥準字','').replace('國藥準','').replace('註冊證號','').replace('醫療機構製劑','').replace('國準字','').replace('該藥已做變更','')
#     pizhunwenhao = re.sub('國?藥?準?字?',string = pizhunwenhao,repl='').replace('食健','').replace('+','/').replace('京制','')
#     # if not re.match(pattern='.*[A-Z]{1,}[0-9]{6,}.*',string=pizhunwenhao):
#     #     pizhunwenhao=''
#     line_data['批准文號'] = pizhunwenhao
#     # print('批准文號',pizhunwenhao)
# #
#     yaopingshuomingcanshu  = list[5].strip().split(',')
#
#     for i in yaopingshuomingcanshu:
#         if '藥品名稱:' in i :
#             yaopingming_shuomingshu = re.sub(pattern='漢語拼音.*',string=i,repl='').replace(':','').replace(':','')
#             yaopingming_shuomingshu = re.sub(pattern='英文名.*',string=i,repl='')
#             yaopingming_shuomingshu = re.sub(pattern='曾用名.*',string=i,repl='')
#             yaopingming_shuomingshu = re.sub(pattern='藥品名稱:?:?',string=i,repl='')
#             if yaopingming_shuomingshu == yaopingming:
#                 yaopingming_shuomingshu = ''
#             line_data['藥品名_說明書'] = yaopingming_shuomingshu
#             # print(yaopingming_shuomingshu)
# #
# #
#         if '藥品規格:' in i :
#             yaopingguige = i.replace('藥品規格:', '')
#             line_data['藥品規格'] = yaopingguige
#             #print(yaopingguige)
#
#         if '藥品單位' in i :
#             yaopingdanwei = re.sub('生產廠家:.*',string=i,repl='').replace('藥品單位:','').replace('/盒','')
#             yaopingdanwei = re.sub('大|中|小',string=yaopingdanwei,repl='')
#             line_data['藥品單位'] = yaopingdanwei
#             #print(yaopingdanwei)
#
#         if '生產廠家' in i:
#             shenchanchangjia = re.sub('.*生產廠家[:是]',string=i,repl='')
#             shenchanchangjia = re.sub(',.*',string=shenchanchangjia,repl='')
#             # print(shenchanchangjia)
#             line_data['生產廠家'] = shenchanchangjia
#             # print(shenchanchangjia)
#
#
#         if '主治疾病' in i :
#             zhuzhijibing = i.replace('主治疾病:','')
#             # print(i)
#             line_data['主治疾病'] = zhuzhijibing
#             # print(zhuzhijibing)
#
#
#         if '醫師建議' and '註冊證號'  in i :
#             # print('醫師建議與註冊證號',i)
#             try:
#                 xiugaizuhcezhenghao = i.split('原註冊證號')[0]
#             except:
#                 pass
#             xiugaizuhcezhenghao = re.sub(pattern='醫師建議:註冊證號',string=i,repl='')
#             xiugaizuhcezhenghao = re.sub('國藥準字',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub('批准文號:?',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub(pattern='[注]?冊證號[:]?',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub(pattern='注*冊證號',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub('原註冊證號.*',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub('分包裝.*',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub('公司名稱.*',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub('原.*',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub('產品.*',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub(':|:|備註|醫師建議',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub('醫師建議注?冊?證?號?',string=xiugaizuhcezhenghao,repl='')
#             xiugaizuhcezhenghao = re.sub('&.*',string=xiugaizuhcezhenghao,repl='')
#             if re.match(pattern='[A-Z]+[0-9]*',string=xiugaizuhcezhenghao):
#                 if xiugaizuhcezhenghao == pizhunwenhao:
#                     # print('批准證號未修改',pizhunwenhao,xiugaizuhcezhenghao)
#                     xiugaizuhcezhenghao = ''
#                 line_data['修改之後的批准文號'] = xiugaizuhcezhenghao
#             else:
#                 line_data['修改之後的批准文號'] = ''
#             # print(line_data['修改之後的批准文號'])
#         elif '醫師建議'  in i :
#             yishijianyi = re.sub(pattern='.*醫師建議:?',string=i,repl='')
#             # print(i)
#             line_data['醫師建議'] = yishijianyi
#             #print(line_data['醫師建議'])
#
# #
#     yaopingxiangxishuomingshu = list[6].strip().split(',')
#     for j in yaopingxiangxishuomingshu:
#         if '藥品名稱' in j :
#             yaopingming_xiangxishuomingshu = re.sub(pattern='.*藥品名稱?:?:?',string=j,repl='').replace(':','').replace(':','')
#             line_data['藥品名_詳細說明書']= yaopingming_xiangxishuomingshu
#             # print(line_data['藥品名_詳細說明書'])
# #
#         if '商品名/商標'  in j:
#             shangbiao = re.sub(pattern='商品名/商標:',string=j,repl='')
#             line_data['商標'] = shangbiao
#             # print(shangbiao)
# #
#
#         if '規格' in j :
#             guige_xiangxishuomingshu  = re.sub(pattern='規格:',string=j,repl='')
#             if len(guige_xiangxishuomingshu) >50:
#                 #print(guige_xiangxishuomingshu)
#                 guige_xiangxishuomingshu = ''
#             if guige_xiangxishuomingshu!= yaopingguige:
#                 line_data['詳細說明書中的藥品規格'] = guige_xiangxishuomingshu
#                 #print('前後規格不一致',yaopingguige,guige_xiangxishuomingshu)
# #
# #
#         if '適應症' in j :
#             shiyingzheng  = re.sub(pattern='適應症:?:?',string=j,repl='')
#             line_data['適應症'] = shiyingzheng
#             #print(shiyingzheng)
#         if '用法用量' in j :
#             yongfayongliang = re.sub(pattern='用法用量:?:?',string=j,repl='')
#             line_data['用法用量'] = yongfayongliang
#             #print(yongfayongliang)
# #
#         if '不良反應:' in j :
#             buliangfanying = re.sub(pattern='不良反應:?:?',string=j,repl='')
#             line_data['不良反應'] = buliangfanying
#             #print(buliangfanying)
# #     # 藥品詳細說明書(藥品名,商標,規格,適應症,用法用量,不良反應,禁忌,注意事項,藥理毒理,生產廠家,批准文號,生產地址,條形碼)']
#         if '禁忌' in  j:
#             jingji = re.sub(pattern='禁忌:?:?',string=j,repl='')
#             line_data['禁忌'] = jingji
#             #print(jingji)
#         if '注意事項' in j :
#             zhuyishixiang  = re.sub(pattern='注意事項:?:?',string=j,repl='')
#             line_data['注意事項'] = zhuyishixiang
#             if jingji == zhuyishixiang:
#                 line_data['注意事項'] = ''
#             # print(zhuyishixiang)
#         line_data['藥理毒理'] = ''
#         if '藥理毒理'  in j :
#             bingliduli = re.sub(pattern='藥理毒理:',string=j,repl='')
#             line_data['藥理毒理'] = bingliduli
#         if '生產廠家' in j :
#             shengchanchagnjia = re.sub(pattern='生產廠家:?',string=j,repl='')
#             line_data['生產廠家'] = shengchanchagnjia
#
#         if '生產地址' in j :
#             shengchandizhi = re.sub(pattern='生產地址:',string=j,repl='')
#             line_data['生產地址'] = shengchandizhi
#         if '條形碼' in j :
#             tiaoxingma = re.sub(pattern='條形碼:?',string=j,repl='')
#             if not re.match(pattern='.*[0-9]{7,}',string=tiaoxingma):
#                 # print('錯誤',tiaoxingma,j)
#                 tiaoxingma = ''
#             line_data['條形碼'] = tiaoxingma
# #
# #
# #
# #
#     if yaopingming_xiangxishuomingshu == yaopingming_shuomingshu and yaopingming_xiangxishuomingshu == yaopingming and yaopingming == yaopingming_shuomingshu:
#         # print('前中後一致')
#         pass
#     elif yaopingming_xiangxishuomingshu == yaopingming:
#         # print('前後相等')
#         line_data['藥品名_詳細說明書'] = ''
#     elif yaopingming_xiangxishuomingshu == yaopingming_shuomingshu:
#         # print('中後相等')
#         line_data['藥品名_詳細說明書'] = ''
#     else:
#         pass
#         # print('都不相等',yaopingming,yaopingming_shuomingshu,yaopingming_xiangxishuomingshu)
#
#     # print(yaopingming)
#     # print(yaopingming_shuomingshu)
#     # print(yaopingming_xiangxishuomingshu,'\n\n\n')
#     flag = 0
#     fenges = ['商品名', '商品名稱', '通用名', '通用名稱', '漢語拼音', '英文名稱', '成份', '英文名', '曾用名','劑型','本品主要成分','主要成分','成分','本品']
#     for fenge in fenges:
#         if fenge in yaopingming_shuomingshu:
#             flag = 1
#             break
#     if flag == 1:
#         yaopingming_shuomingshu = re.sub(pattern='醫師建議:?:?',string=yaopingming_shuomingshu,repl='')
#         if '通用名稱' or '通用名' in yaopingming_shuomingshu:
#             # print(yaopingming_shuomingshu)
#             tongyongming = re.sub(pattern='.*通用名稱?:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','').replace('品名','')
#             # print(tongyongming)
#             # print(tongyongming)
#             # tongyongming1=''
#             for t in fenges:
#                 if t in tongyongming:
#                     # print(t)
#                     tongyongming = tongyongming.split(t)[0]
#                     # print(tongyongming)
#                     # break
#             # print(tongyongming)
#             line_data['通用名'] = tongyongming
#             # print(line_data['通用名'])
#
#         if '商品名稱' or '商品名' in yaopingming_shuomingshu:
#             shangpinmingcheng = re.sub(pattern='.*商品名稱?:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','').replace('品名','')
#             # print(tongyongming)
#             # tongyongming1=''
#             for t in fenges:
#                 if t in shangpinmingcheng:
#                     # print(t)
#                     shangpinmingcheng = shangpinmingcheng.split(t)[0]
#                     # break
#             line_data['商品名稱'] = shangpinmingcheng
#
#         if '漢語拼音' in yaopingming_shuomingshu:
#             hanyupinyin = re.sub(pattern='.*漢語拼音:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','')
#             # print(tongyongming)
#             # tongyongming1=''
#             for t in fenges:
#                 if t in hanyupinyin:
#                     # print(t)
#                     hanyupinyin = hanyupinyin.split(t)[0]
#                     # break
#             line_data['漢語拼音'] = hanyupinyin
#         line_data['英文名稱'] = ''
#         if '英文名稱' or '英文名' in yaopingming_shuomingshu:
#             # print(yaopingming_shuomingshu)
#             yingwenmincheng = re.sub(pattern='.*英文名稱?:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','')
#             # print(tongyongming)
#             # tongyongming1=''
#             for t in fenges:
#                 if t in yingwenmincheng:
#                     # print(t)
#                     yingwenmincheng = yingwenmincheng.split(t)[0]
#                     # break
#             if re.match(pattern='[a-zA-Z]*',string=yingwenmincheng):
#                 # print(yingwenmincheng)
#                 line_data['英文名稱'] = yingwenmincheng
#             else:
#                 line_data['英文名稱'] = ''
#
#         line_data['曾用名'] = ''
#         if '曾用名' in yaopingming_shuomingshu:
#             cengyongming = re.sub(pattern='.*曾用名次?:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','')
#             # print(tongyongming)
#             # tongyongming1=''
#             for t in fenges:
#                 if t in cengyongming:
#                     # print(t)
#                     cengyongming = cengyongming.split(t)[0]
#                     # break
#             line_data['曾用名'] = cengyongming
#
#         line_data['劑型'] = ''
#         if '劑型' in yaopingming_shuomingshu:
#             jixing = re.sub(pattern='.*劑型:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','')
#             # print(tongyongming)
#             # tongyongming1=''
#             for t in fenges:
#                 if t in jixing:
#                     # print(t)
#                     jixing = jixing.split(t)[0]
#                     # print(jixing)
#                     # break
#             line_data['劑型'] = jixing
#             # print(jixing)
#
#     data.append(line_data)
# df = pd.DataFrame(data)
# df = df.fillna('')
# jishu =0
# clounms = ['藥品名_藥盒','藥品名_說明書','藥品名_詳細說明書','通用名','曾用名','商品名稱','英文名稱','漢語拼音','劑型','是否處方藥','藥品類別','是否醫保','價格','批准文號','修改之後的批准文號',
#            '藥品規格','詳細說明書中的藥品規格','藥品單位','用法用量','適應症','主治疾病','不良反應','禁忌','注意事項','藥理毒理','醫師建議','生產地址','生產廠家','藥品地址','條形碼','商標']
# df = df[clounms]
# df.to_csv('健客.csv',index=False)
# 將得到的藥品說明書進行去重,得到所有的藥品樣品數量
import pandas as pd
import _thread
# De-duplicate the leaflet table: a row that shares any name/alias with an
# earlier, kept row is treated as the same product and removed.

# Let pandas open (and close) the file itself instead of leaking a handle.
ff = pd.read_csv('中成藥.csv', encoding='utf-8')

# Name/alias columns that are compared between rows.
A = ['藥品名_藥盒', '藥品名_詳細說明書', '通用名', '曾用名', '商品名稱', '英文名稱', '漢語拼音']

# Values shared by the first two rows of the raw table. An overlap that is
# exactly this set is not counted as a duplicate.
# NOTE(review): presumably this captures the "blank" placeholder (e.g. NaN) and
# assumes rows 0 and 1 are different products -- confirm against the data.
kong = set(ff[A].loc[0]) & set(ff[A].loc[1]) if len(ff) > 1 else set()

# Exact duplicates on the box name are removed up front; the index is reset so
# that row labels equal row positions for the scans below.
f = ff.drop_duplicates(['藥品名_藥盒']).reset_index(drop=True)
changdu = int(f.shape[0])

# Build each row's set of names once instead of on every pairwise comparison.
_row_names = [set(row) for _, row in f[A].iterrows()]

# Row labels marked for removal (a set, so membership tests are O(1)).
duplicate_rows = set()


def _mark_duplicates(start, stop):
    """Mark later rows that share a name with a kept row in [start, stop).

    Only rows *after* ``i`` are examined, so a row never matches itself and,
    for any pair of duplicates, only the later one is marked. Rows that are
    already marked are skipped on both sides of the comparison.
    """
    for i in range(start, stop):
        if i in duplicate_rows:
            continue
        names_i = _row_names[i]
        for j in range(i + 1, changdu):
            if j in duplicate_rows:
                continue
            chongfu = names_i & _row_names[j]
            # Non-empty overlap that is more than just the placeholder set.
            if chongfu and chongfu != kong:
                print(names_i, _row_names[j], chongfu)
                duplicate_rows.add(j)


def f1():
    """Scan the first quarter of the rows."""
    _mark_duplicates(0, changdu // 4)


def f2():
    """Scan the second quarter of the rows."""
    _mark_duplicates(changdu // 4, changdu // 4 * 2)


def f3():
    """Scan the third quarter of the rows."""
    _mark_duplicates(changdu // 4 * 2, changdu // 4 * 3)


def f4():
    """Scan the last quarter of the rows (including any remainder)."""
    _mark_duplicates(changdu // 4 * 3, changdu)


# Run the scans in ascending order on the main thread. Whether row i is
# scanned depends on whether an earlier row has already marked it, so running
# the quarters concurrently would make the result timing-dependent. Errors are
# allowed to propagate instead of being hidden behind a bare except.
for scan in (f1, f2, f3, f4):
    scan()

# DataFrame.drop returns a new frame; keep the result so the removal sticks.
f = f.drop(index=sorted(duplicate_rows))
f.to_csv('刪除之後的中成藥_{}.csv'.format(f.shape[0]), encoding='utf-8')
print(sorted(duplicate_rows))
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章