# 1. 一個表格存放待查的基因;另一個工作簿中,不同癌種的基因信息放在不同的工作表裏。需要查找前一個表格中的每個基因出現在工作簿的哪些癌種中。
#!/usr/bin/env python
# encoding=utf-8
# 目的是根據基因在另一個工作簿中各個表格匹配癌種
import pandas as pd
import sys
# Python 2 only: switch the implicit str<->unicode codec to UTF-8 so that
# non-ASCII text can be written to a plain file object. Python 3 has no
# builtin ``reload`` and does not need the hack, so it is skipped there.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')

# Workbook with one sheet per cancer type; every sheet must have a 'Gene' column.
df = pd.ExcelFile('../allgene.xlsx')

# Table of query genes; only its first column is used below.
liquid_gene = pd.read_excel('../liquid_ biospy.xlsx', header=0)
liquid_gene_list = liquid_gene.values.tolist()

# Map sheet name (cancer type) -> set of genes listed in that sheet.
# Parsing through the already-open ExcelFile avoids re-opening the workbook
# once per sheet, and a set keeps the membership tests below cheap.
dic = {}
for name in df.sheet_names:
    dic[name] = set(df.parse(name, header=0)['Gene'].values.tolist())

# Map gene -> list of EVERY cancer type whose sheet contains it.
# The previous version stored the first hit as a bare string and then
# replaced it with a list holding only the later hits, silently dropping
# the first cancer type for any gene found in two or more sheets.
liquid = {}
for gene in liquid_gene_list:
    symbol = gene[0]
    if symbol in liquid:
        # Repeated row in the query table; the result is already complete.
        continue
    # Walk the sheets in workbook order so the output order is stable.
    matches = [name for name in df.sheet_names if symbol in dic[name]]
    if matches:
        liquid[symbol] = matches

# Write one line per matched gene: "<gene>\t<cancer1>,<cancer2>,...".
# Genes found in no sheet are not written. ``with`` guarantees the file is
# flushed and closed.
with open('../res1.txt', 'w') as output:
    for k, v in liquid.items():
        output.write('%s\t%s\n' % (k, ','.join('%s' % cancer for cancer in v)))
# 2. 把 Word 文檔批量轉換爲 PDF。用 Python 有點慢;如果文檔數量超過 200 個,建議改用 Java,只需幾秒鐘即可完成。
# !/usr/bin/env python
# coding=utf-8
from multiprocessing import Pool
import os
from win32com.client import Dispatch, constants, gencache
def doc2pdf(doc_name, pdf_name):
    """Export one Word document to PDF through Word's COM automation interface.

    Args:
        doc_name: Path passed to ``Documents.Open`` (opened read-only).
        pdf_name: Output path passed to ``ExportAsFixedFormat``.

    Returns:
        1 if opening or exporting raised an exception (the exception is
        printed, not re-raised); None otherwise.
    """
    # NOTE(review): presumably this loads the generated wrappers for the Word
    # type library so that the ``constants.wd*`` names below resolve - confirm.
    gencache.EnsureModule('{00020905-0000-0000-C000-000000000046}', 0, 8, 4)
    word_app = Dispatch("Word.Application")
    try:
        opened = word_app.Documents.Open(doc_name, ReadOnly=1)
        opened.ExportAsFixedFormat(
            pdf_name,
            constants.wdExportFormatPDF,
            Item=constants.wdExportDocumentWithMarkup,
            CreateBookmarks=constants.wdExportCreateHeadingBookmarks,
        )
    except Exception as err:
        print(err)
        return 1
    finally:
        # Runs on success and on failure, so the Word application obtained
        # above is always asked to quit.
        word_app.Quit(constants.wdDoNotSaveChanges)
def pdf_path_for(fullname):
    """Return the PDF path to export ``fullname`` to, or None to skip it.

    Only files whose extension is ``.docx`` (case-insensitive) qualify. The
    extension is taken with ``os.path.splitext`` so that extra dots in the
    file or directory names are harmless; the previous ``split(".")``
    raised ValueError for such paths, and the previous substring test
    ``'docx' in fullname`` also matched unrelated files whose path merely
    contained "docx".

    Args:
        fullname: Path of a candidate file.

    Returns:
        ``fullname`` with its extension replaced by ``.pdf``, or None.
    """
    stem, ext = os.path.splitext(fullname)
    if ext.lower() != '.docx':
        return None
    # Word keeps a "~$<name>.docx" owner file next to a document that is
    # open; it is not a real document and cannot be converted.
    if os.path.basename(fullname).startswith('~$'):
        return None
    return "{}.pdf".format(stem)


if __name__ == '__main__':
    print('Parent process %s.' % os.getpid())
    p = Pool(2)
    # Full paths are handed to the workers on purpose: per the original
    # author's note, examples found online omitted the document directory,
    # which made it unclear where the exported PDF ended up. Each PDF is
    # written next to its source document.
    for root, ds, fs in os.walk('C:\\Users\\guosheng\\Desktop\\20200426\\'):
        for f in fs:
            fullname = os.path.join(root, f)
            pdf = pdf_path_for(fullname)
            if pdf is not None:
                p.apply_async(doc2pdf, args=(fullname, pdf))
    # No more tasks will be submitted; wait for the queued conversions.
    p.close()
    p.join()
    print('All subprocesses done.')
# 3. 按工作簿中每個工作表的名字建立文件夾,並把各工作表中列出的 PDF 文件移動到對應名字的文件夾下。
#!bin/python
#encoding = utf-8
import numpy
import pandas as pd
import glob,os,sys
import shutil
# Python 2 only: switch the implicit str<->unicode codec to UTF-8. Python 3
# has no builtin ``reload`` and does not need the hack, so it is skipped there.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')

df = pd.ExcelFile('total.xlsx')
for name in df.sheet_names:
    # One target folder per sheet, created in the current working directory
    # if it is not there yet.
    pathname = os.path.join(os.getcwd(), name)
    if not os.path.isdir(name):
        os.mkdir(name)

    # The header is taken from the second row of the sheet (header=1).
    # Parsing through the open ExcelFile avoids re-opening the workbook.
    sheet = df.parse(name, header=1)

    # iloc is positional, so iterate positions rather than index labels.
    for row in range(len(sheet.index)):
        # PDF file name is "<second column>-<third column>.pdf".
        pdf_name = sheet.iloc[row, 1] + "-" + sheet.iloc[row, 2] + '.pdf'
        source = os.path.join(os.getcwd(), pdf_name)
        if not os.path.exists(source):
            # Report and carry on: this keeps the script re-runnable after
            # a partial run, when earlier files have already been moved.
            sys.stderr.write('skip (not found): %s\n' % source)
            continue
        shutil.move(source, os.path.join(pathname, pdf_name))
# Copy a file from a fixed sub-path of each listed directory into ./variants/,
# prefixing the copied file's name with the directory name.
variants_dir = os.path.join(os.getcwd(), 'variants')
if not os.path.isdir(variants_dir):
    # shutil.copy fails if the destination folder is missing.
    os.makedirs(variants_dir)

# 'filename.txt' holds one directory name (relative to the current working
# directory) per line. Text mode keeps the lines as str on Python 2 and 3,
# and ``with`` closes the handle.
with open('filename.txt', 'r') as listing:
    for line in listing:
        # Drop the line terminator (including a Windows '\r') and skip
        # blank lines, which would otherwise turn into bogus paths.
        entry = line.rstrip('\r\n')
        if not entry:
            continue
        path = os.path.join(os.getcwd(), entry) + '/Target_hg19_work_dir/2.result/tumor/result_variation/snp_indel/'
        old_filename = os.path.join(path, 'tumor.3tools.hg19_multianno.csv')
        newfile = entry + "_tumor.3tools.hg19_multianno.csv"
        new_filename = os.path.join(variants_dir, newfile)
        shutil.copy(old_filename, new_filename)