本文記錄了在創新工場實習期間處理藥物說明書數據所用的代碼,全部是規則匹配和正則表達式的字符串處理,沒什麼技術含量,僅做個記錄,方便日後搬運。
# 處理爬取的藥物說明書數據集時使用的代碼,藥物說明書鏈接:https://pan.baidu.com/s/1v1puWYLY8lDnbKKEPVTFgg 密碼:jjw7
import re
import pandas as pd
# Handle on the raw scraped text file. The loop that reads it is commented
# out below, so the handle is currently opened but never read or closed, and
# the name `f` is rebound to a DataFrame later in this file.
f = open('./藥品說明書(除西藥)/健客.txt','r')
# Parsed records; the commented-out parser appends one dict per input line.
data = []
# Not referenced by any code visible in this file.
num = 0
# print(clounms)
#for line in f.readlines():
#line = re.sub('\[|\]|\'|"',string=line,repl='')
#list = line.split('\t')
#line_data = {}
#jingji = ''
# yaopingdizhi = list[0]
# line_data['藥品地址'] = yaopingdizhi
# print(yaopingdizhi)
# yaopingming = list[1].split(' ')[0].replace('價格','')# 藥品名稱在後面一併處理
# line_data['藥品名_藥盒'] = yaopingming
# # print(line_data['藥品名_藥盒'])
#
# list2 = list[2].split(',')
# if len(list2) != 2:# 非藥品有兩個標籤,藥品有三個標籤,非藥品沒有是否處方藥
# chufangyao = ''
# yaopingleibie = ''
# yibao = ''
# else:
# chufangyao = ''
# yaopingleibie = list2[0]
# yibao = list2[1]
# line_data['是否處方藥'] = chufangyao
# line_data['藥品類別'] =yaopingleibie
# line_data['是否醫保'] = yibao
#
#
#
# # print(list[3])
# jiage = float(re.sub('¥|,',string=list[3],repl=''))
# line_data['價格'] = jiage
# # print(jiage)
#
#
# pizhunwenhao = re.sub('批准文號:',string = list[4],repl='').replace(' ','').replace('國藥準字','').replace('國藥準','').replace('註冊證號','').replace('醫療機構製劑','').replace('國準字','').replace('該藥已做變更','')
# pizhunwenhao = re.sub('國?藥?準?字?',string = pizhunwenhao,repl='').replace('食健','').replace('+','/').replace('京制','')
# # if not re.match(pattern='.*[A-Z]{1,}[0-9]{6,}.*',string=pizhunwenhao):
# # pizhunwenhao=''
# line_data['批准文號'] = pizhunwenhao
# # print('批准文號',pizhunwenhao)
# #
# yaopingshuomingcanshu = list[5].strip().split(',')
#
# for i in yaopingshuomingcanshu:
# if '藥品名稱:' in i :
# yaopingming_shuomingshu = re.sub(pattern='漢語拼音.*',string=i,repl='').replace(':','').replace(':','')
# yaopingming_shuomingshu = re.sub(pattern='英文名.*',string=i,repl='')
# yaopingming_shuomingshu = re.sub(pattern='曾用名.*',string=i,repl='')
# yaopingming_shuomingshu = re.sub(pattern='藥品名稱:?:?',string=i,repl='')
# if yaopingming_shuomingshu == yaopingming:
# yaopingming_shuomingshu = ''
# line_data['藥品名_說明書'] = yaopingming_shuomingshu
# # print(yaopingming_shuomingshu)
# #
# #
# if '藥品規格:' in i :
# yaopingguige = i.replace('藥品規格:', '')
# line_data['藥品規格'] = yaopingguige
# #print(yaopingguige)
#
# if '藥品單位' in i :
# yaopingdanwei = re.sub('生產廠家:.*',string=i,repl='').replace('藥品單位:','').replace('/盒','')
# yaopingdanwei = re.sub('大|中|小',string=yaopingdanwei,repl='')
# line_data['藥品單位'] = yaopingdanwei
# #print(yaopingdanwei)
#
# if '生產廠家' in i:
# shenchanchangjia = re.sub('.*生產廠家[:是]',string=i,repl='')
# shenchanchangjia = re.sub(',.*',string=shenchanchangjia,repl='')
# # print(shenchanchangjia)
# line_data['生產廠家'] = shenchanchangjia
# # print(shenchanchangjia)
#
#
# if '主治疾病' in i :
# zhuzhijibing = i.replace('主治疾病:','')
# # print(i)
# line_data['主治疾病'] = zhuzhijibing
# # print(zhuzhijibing)
#
#
# if '醫師建議' and '註冊證號' in i :
# # print('醫師建議與註冊證號',i)
# try:
# xiugaizuhcezhenghao = i.split('原註冊證號')[0]
# except:
# pass
# xiugaizuhcezhenghao = re.sub(pattern='醫師建議:註冊證號',string=i,repl='')
# xiugaizuhcezhenghao = re.sub('國藥準字',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub('批准文號:?',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub(pattern='[注]?冊證號[:]?',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub(pattern='注*冊證號',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub('原註冊證號.*',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub('分包裝.*',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub('公司名稱.*',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub('原.*',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub('產品.*',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub(':|:|備註|醫師建議',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub('醫師建議注?冊?證?號?',string=xiugaizuhcezhenghao,repl='')
# xiugaizuhcezhenghao = re.sub('&.*',string=xiugaizuhcezhenghao,repl='')
# if re.match(pattern='[A-Z]+[0-9]*',string=xiugaizuhcezhenghao):
# if xiugaizuhcezhenghao == pizhunwenhao:
# # print('批准證號未修改',pizhunwenhao,xiugaizuhcezhenghao)
# xiugaizuhcezhenghao = ''
# line_data['修改之後的批准文號'] = xiugaizuhcezhenghao
# else:
# line_data['修改之後的批准文號'] = ''
# # print(line_data['修改之後的批准文號'])
# elif '醫師建議' in i :
# yishijianyi = re.sub(pattern='.*醫師建議:?',string=i,repl='')
# # print(i)
# line_data['醫師建議'] = yishijianyi
# #print(line_data['醫師建議'])
#
# #
# yaopingxiangxishuomingshu = list[6].strip().split(',')
# for j in yaopingxiangxishuomingshu:
# if '藥品名稱' in j :
# yaopingming_xiangxishuomingshu = re.sub(pattern='.*藥品名稱?:?:?',string=j,repl='').replace(':','').replace(':','')
# line_data['藥品名_詳細說明書']= yaopingming_xiangxishuomingshu
# # print(line_data['藥品名_詳細說明書'])
# #
# if '商品名/商標' in j:
# shangbiao = re.sub(pattern='商品名/商標:',string=j,repl='')
# line_data['商標'] = shangbiao
# # print(shangbiao)
# #
#
# if '規格' in j :
# guige_xiangxishuomingshu = re.sub(pattern='規格:',string=j,repl='')
# if len(guige_xiangxishuomingshu) >50:
# #print(guige_xiangxishuomingshu)
# guige_xiangxishuomingshu = ''
# if guige_xiangxishuomingshu!= yaopingguige:
# line_data['詳細說明書中的藥品規格'] = guige_xiangxishuomingshu
# #print('前後規格不一致',yaopingguige,guige_xiangxishuomingshu)
# #
# #
# if '適應症' in j :
# shiyingzheng = re.sub(pattern='適應症:?:?',string=j,repl='')
# line_data['適應症'] = shiyingzheng
# #print(shiyingzheng)
# if '用法用量' in j :
# yongfayongliang = re.sub(pattern='用法用量:?:?',string=j,repl='')
# line_data['用法用量'] = yongfayongliang
# #print(yongfayongliang)
# #
# if '不良反應:' in j :
# buliangfanying = re.sub(pattern='不良反應:?:?',string=j,repl='')
# line_data['不良反應'] = buliangfanying
# #print(buliangfanying)
# # # 藥品詳細說明書(藥品名,商標,規格,適應症,用法用量,不良反應,禁忌,注意事項,藥理毒理,生產廠家,批准文號,生產地址,條形碼)']
# if '禁忌' in j:
# jingji = re.sub(pattern='禁忌:?:?',string=j,repl='')
# line_data['禁忌'] = jingji
# #print(jingji)
# if '注意事項' in j :
# zhuyishixiang = re.sub(pattern='注意事項:?:?',string=j,repl='')
# line_data['注意事項'] = zhuyishixiang
# if jingji == zhuyishixiang:
# line_data['注意事項'] = ''
# # print(zhuyishixiang)
# line_data['藥理毒理'] = ''
# if '藥理毒理' in j :
# bingliduli = re.sub(pattern='藥理毒理:',string=j,repl='')
# line_data['藥理毒理'] = bingliduli
# if '生產廠家' in j :
# shengchanchagnjia = re.sub(pattern='生產廠家:?',string=j,repl='')
# line_data['生產廠家'] = shengchanchagnjia
#
# if '生產地址' in j :
# shengchandizhi = re.sub(pattern='生產地址:',string=j,repl='')
# line_data['生產地址'] = shengchandizhi
# if '條形碼' in j :
# tiaoxingma = re.sub(pattern='條形碼:?',string=j,repl='')
# if not re.match(pattern='.*[0-9]{7,}',string=tiaoxingma):
# # print('錯誤',tiaoxingma,j)
# tiaoxingma = ''
# line_data['條形碼'] = tiaoxingma
# #
# #
# #
# #
# if yaopingming_xiangxishuomingshu == yaopingming_shuomingshu and yaopingming_xiangxishuomingshu == yaopingming and yaopingming == yaopingming_shuomingshu:
# # print('前中後一致')
# pass
# elif yaopingming_xiangxishuomingshu == yaopingming:
# # print('前後相等')
# line_data['藥品名_詳細說明書'] = ''
# elif yaopingming_xiangxishuomingshu == yaopingming_shuomingshu:
# # print('中後相等')
# line_data['藥品名_詳細說明書'] = ''
# else:
# pass
# # print('都不相等',yaopingming,yaopingming_shuomingshu,yaopingming_xiangxishuomingshu)
#
# # print(yaopingming)
# # print(yaopingming_shuomingshu)
# # print(yaopingming_xiangxishuomingshu,'\n\n\n')
# flag = 0
# fenges = ['商品名', '商品名稱', '通用名', '通用名稱', '漢語拼音', '英文名稱', '成份', '英文名', '曾用名','劑型','本品主要成分','主要成分','成分','本品']
# for fenge in fenges:
# if fenge in yaopingming_shuomingshu:
# flag = 1
# break
# if flag == 1:
# yaopingming_shuomingshu = re.sub(pattern='醫師建議:?:?',string=yaopingming_shuomingshu,repl='')
# if '通用名稱' or '通用名' in yaopingming_shuomingshu:
# # print(yaopingming_shuomingshu)
# tongyongming = re.sub(pattern='.*通用名稱?:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','').replace('品名','')
# # print(tongyongming)
# # print(tongyongming)
# # tongyongming1=''
# for t in fenges:
# if t in tongyongming:
# # print(t)
# tongyongming = tongyongming.split(t)[0]
# # print(tongyongming)
# # break
# # print(tongyongming)
# line_data['通用名'] = tongyongming
# # print(line_data['通用名'])
#
# if '商品名稱' or '商品名' in yaopingming_shuomingshu:
# shangpinmingcheng = re.sub(pattern='.*商品名稱?:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','').replace('品名','')
# # print(tongyongming)
# # tongyongming1=''
# for t in fenges:
# if t in shangpinmingcheng:
# # print(t)
# shangpinmingcheng = shangpinmingcheng.split(t)[0]
# # break
# line_data['商品名稱'] = shangpinmingcheng
#
# if '漢語拼音' in yaopingming_shuomingshu:
# hanyupinyin = re.sub(pattern='.*漢語拼音:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','')
# # print(tongyongming)
# # tongyongming1=''
# for t in fenges:
# if t in hanyupinyin:
# # print(t)
# hanyupinyin = hanyupinyin.split(t)[0]
# # break
# line_data['漢語拼音'] = hanyupinyin
# line_data['英文名稱'] = ''
# if '英文名稱' or '英文名' in yaopingming_shuomingshu:
# # print(yaopingming_shuomingshu)
# yingwenmincheng = re.sub(pattern='.*英文名稱?:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','')
# # print(tongyongming)
# # tongyongming1=''
# for t in fenges:
# if t in yingwenmincheng:
# # print(t)
# yingwenmincheng = yingwenmincheng.split(t)[0]
# # break
# if re.match(pattern='[a-zA-Z]*',string=yingwenmincheng):
# # print(yingwenmincheng)
# line_data['英文名稱'] = yingwenmincheng
# else:
# line_data['英文名稱'] = ''
#
# line_data['曾用名'] = ''
# if '曾用名' in yaopingming_shuomingshu:
# cengyongming = re.sub(pattern='.*曾用名次?:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','')
# # print(tongyongming)
# # tongyongming1=''
# for t in fenges:
# if t in cengyongming:
# # print(t)
# cengyongming = cengyongming.split(t)[0]
# # break
# line_data['曾用名'] = cengyongming
#
# line_data['劑型'] = ''
# if '劑型' in yaopingming_shuomingshu:
# jixing = re.sub(pattern='.*劑型:?:?', string=yaopingming_shuomingshu, repl='').replace(':', '').replace('。','')
# # print(tongyongming)
# # tongyongming1=''
# for t in fenges:
# if t in jixing:
# # print(t)
# jixing = jixing.split(t)[0]
# # print(jixing)
# # break
# line_data['劑型'] = jixing
# # print(jixing)
#
# data.append(line_data)
# df = pd.DataFrame(data)
# df = df.fillna('')
# jishu =0
# clounms = ['藥品名_藥盒','藥品名_說明書','藥品名_詳細說明書','通用名','曾用名','商品名稱','英文名稱','漢語拼音','劑型','是否處方藥','藥品類別','是否醫保','價格','批准文號','修改之後的批准文號',
# '藥品規格','詳細說明書中的藥品規格','藥品單位','用法用量','適應症','主治疾病','不良反應','禁忌','注意事項','藥理毒理','醫師建議','生產地址','生產廠家','藥品地址','條形碼','商標']
# df = df[clounms]
# df.to_csv('健客.csv',index=False)
# 將得到的藥品說明書進行去重,得到所有的藥品樣品數量
import pandas as pd
import _thread
# Load the records to de-duplicate. The context manager closes the file as
# soon as pandas has finished parsing it.
with open('中成藥.csv','r',encoding='utf-8') as csv_file:
    ff = pd.read_csv(csv_file)
# Row labels of `f` that f1..f4 mark as duplicates.
# NOTE: this name shadows the builtin `list` for the rest of the module; it is
# kept because the functions below refer to it by this name.
list=[]
# Name columns that are compared between rows when looking for duplicates.
A =['藥品名_藥盒','藥品名_詳細說明書','通用名','曾用名','商品名稱','英文名稱','漢語拼音']
# Values the first two raw rows have in common. f1..f4 treat an overlap equal
# to this set as "nothing meaningful shared" (presumably it captures the
# missing-value placeholder -- TODO confirm against the data).
kong = set(ff[A].loc[0])&set(ff[A].loc[1])
# Keep the first row per box name and renumber the rows from 0, so positions
# produced by range() are valid .loc labels.
f = ff.drop_duplicates(['藥品名_藥盒']).reset_index(drop=True)
# Number of rows left to scan; f1..f4 each take one quarter of this range.
changdu = int(f.shape[0])
def f1():
    """Mark rows that duplicate a row in the first quarter of ``f``.

    Scans rows ``0`` to ``changdu // 4 - 1``. For every scanned row that is
    not already marked, each later row sharing at least one value in the
    name columns ``A`` (beyond the baseline set ``kong``) has its label
    appended to the module-level ``list``.

    Reads the module globals ``f``, ``A``, ``kong``, ``changdu`` and ``list``;
    returns nothing.
    """
    names = f[A]
    for i in range(0, changdu // 4):
        if i in list:
            # Row i was already marked as a duplicate; do not use it as a
            # reference row.
            continue
        row_i = set(names.loc[i])
        # Start after i: a row compared with itself always overlaps, which
        # would mark every scanned row for deletion.
        for j in range(i + 1, f.shape[0]):
            row_j = set(names.loc[j])
            chongfu = row_i & row_j
            if chongfu != kong and chongfu != set():
                print(row_i, row_j, chongfu)
                list.append(j)
def f2():
    """Mark rows that duplicate a row in the second quarter of ``f``.

    Scans rows ``changdu // 4`` to ``changdu // 4 * 2 - 1``. For every
    scanned row that is not already marked, each later row sharing at least
    one value in the name columns ``A`` (beyond the baseline set ``kong``)
    has its label appended to the module-level ``list``.

    Reads the module globals ``f``, ``A``, ``kong``, ``changdu`` and ``list``;
    returns nothing.
    """
    names = f[A]
    for i in range(changdu // 4, changdu // 4 * 2):
        if i in list:
            # Row i was already marked as a duplicate; do not use it as a
            # reference row.
            continue
        row_i = set(names.loc[i])
        # Start after i: a row compared with itself always overlaps, which
        # would mark every scanned row for deletion.
        for j in range(i + 1, f.shape[0]):
            row_j = set(names.loc[j])
            chongfu = row_i & row_j
            if chongfu != kong and chongfu != set():
                print(row_i, row_j, chongfu)
                list.append(j)
def f3():
    """Mark rows that duplicate a row in the third quarter of ``f``.

    Scans rows ``changdu // 4 * 2`` to ``changdu // 4 * 3 - 1``. For every
    scanned row that is not already marked, each later row sharing at least
    one value in the name columns ``A`` (beyond the baseline set ``kong``)
    has its label appended to the module-level ``list``.

    Reads the module globals ``f``, ``A``, ``kong``, ``changdu`` and ``list``;
    returns nothing.
    """
    names = f[A]
    for i in range(changdu // 4 * 2, changdu // 4 * 3):
        if i in list:
            # Row i was already marked as a duplicate; do not use it as a
            # reference row.
            continue
        row_i = set(names.loc[i])
        # Start after i: a row compared with itself always overlaps, which
        # would mark every scanned row for deletion.
        for j in range(i + 1, f.shape[0]):
            row_j = set(names.loc[j])
            chongfu = row_i & row_j
            if chongfu != kong and chongfu != set():
                print(row_i, row_j, chongfu)
                list.append(j)
def f4():
    """Mark rows that duplicate a row in the last quarter of ``f``.

    Scans rows ``changdu // 4 * 3`` to ``changdu - 1``, which also covers the
    remainder when ``changdu`` is not a multiple of four. For every scanned
    row that is not already marked, each later row sharing at least one value
    in the name columns ``A`` (beyond the baseline set ``kong``) has its
    label appended to the module-level ``list``.

    Reads the module globals ``f``, ``A``, ``kong``, ``changdu`` and ``list``;
    returns nothing.
    """
    names = f[A]
    for i in range(changdu // 4 * 3, changdu):
        if i in list:
            # Row i was already marked as a duplicate; do not use it as a
            # reference row.
            continue
        row_i = set(names.loc[i])
        # Start after i: a row compared with itself always overlaps, which
        # would mark every scanned row for deletion.
        for j in range(i + 1, f.shape[0]):
            row_j = set(names.loc[j])
            chongfu = row_i & row_j
            if chongfu != kong and chongfu != set():
                print(row_i, row_j, chongfu)
                list.append(j)
# Run the four quarter-scans in worker threads, wait for all of them, then
# drop the marked rows and write the result.
import threading

# Pass the functions themselves as targets; calling them here would run the
# scan in the main thread and hand the thread API a None target.
workers = [threading.Thread(target=scan) for scan in (f1, f2, f3, f4)]
started = []
try:
    for worker in workers:
        worker.start()
        started.append(worker)
except RuntimeError as exc:
    # Raised when a thread cannot be started; keep the best-effort behaviour
    # and continue with the workers that did start.
    print('錯誤', exc)
# The marks are only complete once every worker has finished.
# NOTE: each scan skips rows already present in `list`, so the exact set of
# marked rows can depend on how the workers interleave.
for worker in started:
    worker.join()
# DataFrame.drop returns a new frame, so the result has to be assigned back.
# The same label may have been marked more than once; de-duplicate first.
f = f.drop(index=sorted(set(list)))
f.to_csv('刪除之後的中成藥_{}.csv'.format(f.shape[0]),encoding='utf-8')
print(list)