最近分析一批panel測序數據,Panel中包含一些針對Fusion的捕獲探針,使用delly等call SV的軟件效果並不好。所以嘗試使用genefuse這個軟件。
genefuse這個軟件是根據一些先驗的可能形成Fusion的基因對,直接從fastq文件檢測Fusion,我覺得性能可能會比沒有先驗知識的caller更好一些。但是,我做的是血液腫瘤,而官方提供的先驗Fusion文件中沒有包括血液腫瘤中常見的Fusion基因(如RUNX1-RUNX1T1)。官方給的生成Fusion文件的方法也總是報錯(主要是那個腳本是julia寫的,我不會……)。所以就自己動手豐衣足食了。
根據作者提供的信息以及觀察fusion文件格式,感覺可以從GTF文件中直接抽取相關信息。代碼如下:
#!/usr/bin/env python3
from collections import OrderedDict
import logging
# Root logger threshold is WARNING, so the logging.info() progress messages
# issued for each matched gene are not emitted at this level.
logging.basicConfig(level=logging.WARNING)
def _start_gene_block(gene_record):
    """Build a fresh gene block from a parsed ``gene`` record.

    Raises:
        KeyError: if the record lacks ``gene_id``, ``gene_version`` or
            ``gene_name`` in its attribute column.
    """
    attr = gene_record['attr']
    try:
        uniq_gene_id = ".".join([attr['gene_id'], attr['gene_version']])
        gene_name = attr['gene_name']
    except KeyError as err:
        raise KeyError(
            "The feature line could not be inserted into geneblock: "
            "missing attribute {0}".format(err)) from err
    return {
        "empty": False,
        "gene": gene_record,
        "uniq_gene_id": uniq_gene_id,
        "gene_name": gene_name,
        "transcript": OrderedDict(),
    }


def main(gtf, genelist, mode):
    """Stream a GTF file gene by gene and hand each gene to process_gene_block.

    The GTF is assumed to be ordered gene -> transcript -> sub-features
    (exon, CDS, ...). Records are grouped into one "gene block" at a time:

        {
            "empty": False,
            "gene": <parsed gene record>,
            "uniq_gene_id": "ENSGxxxx.<version>",
            "gene_name": "DDX41",
            "transcript": OrderedDict({
                "ENSTxxxx.<version>": {
                    "transcript_information": <parsed transcript record>,
                    "ENSTxxxx.<version>.<start>.<end>---exon": <parsed record>,
                    ...
                },
            }),
        }

    Args:
        gtf: path to the GTF annotation file.
        genelist: path to a text file with one gene name per line.
        mode: selection mode forwarded unchanged to process_gene_block.

    Raises:
        KeyError: if a gene record lacks gene_id / gene_version / gene_name,
            or a non-gene record lacks transcript_id / transcript_version.
    """
    # A set gives O(1) membership tests; blank lines carry no gene name.
    with open(genelist, 'r') as handle:
        target_genelist = {name.strip() for name in handle if name.strip()}

    # Placeholder until the first gene record is seen.
    gene_block = {
        "empty": True,
        "gene": {},
        "transcript": OrderedDict()
    }

    with open(gtf, 'r') as handle:
        for record in handle:
            stripped = record.strip()
            # Skip header/comment lines and blank lines (the latter cannot be
            # split into the nine GTF columns).
            if not stripped or stripped.startswith("#"):
                continue
            a = parse_oneline(record)

            if a['feature'] == "gene":
                # A new gene line closes the previous block, if there is one.
                if not gene_block['empty']:
                    process_gene_block(gene_block, target_genelist, mode)
                gene_block = _start_gene_block(a)
                continue

            uniq_trans_id = ".".join(
                [a['attr']['transcript_id'], a['attr']['transcript_version']])
            transcripts = gene_block['transcript']
            if a['feature'] == 'transcript':
                # Register the transcript the first time it is seen.
                if uniq_trans_id not in transcripts:
                    transcripts[uniq_trans_id] = {'transcript_information': a}
            elif uniq_trans_id in transcripts:
                # Sub-features are attached only to an already registered
                # transcript; the key encodes transcript, span and feature.
                general_uniq_id = uniq_trans_id + "." + \
                    a['start'] + "." + a['end'] + \
                    "---" + a['feature']
                transcripts[uniq_trans_id][general_uniq_id] = a

    # Flush the last gene block; without any gene record there is nothing to
    # process (and no 'gene_name' to look up).
    if gene_block['empty']:
        logging.warning("[Warning] no gene record found in {0}".format(gtf))
        return
    process_gene_block(gene_block, target_genelist, mode)
def process_gene_block(gene_block, target_genelist, mode):
    """Print the selected transcript(s) of one gene block if the gene is targeted.

    Args:
        gene_block: dict with at least 'gene_name' and 'transcript'
            (transcript id -> dict holding 'transcript_information').
        target_genelist: container of gene names to report.
        mode: "longest" prints the transcript with the largest end - start
            span; "all" prints every transcript. Any other value prints
            nothing.

    Only transcripts whose gene_biotype attribute is 'protein_coding' are
    considered in either mode.
    """
    gene_symbol = gene_block['gene_name'].strip()
    if gene_symbol not in target_genelist:
        return

    logging.info(
        "[notice] found {0} in GTF file, start to extract information...".format(gene_symbol))
    transcripts = gene_block['transcript']

    if mode == "all":
        for record in transcripts.values():
            if record['transcript_information']['attr']['gene_biotype'] == 'protein_coding':
                print_transcript(record)
        return

    if mode != "longest":
        return

    # Track the widest protein-coding transcript; a span must be strictly
    # positive to be selected, and the first one wins on ties.
    best_id = None
    best_span = 0
    for trans_id, record in transcripts.items():
        info = record['transcript_information']
        if info['attr']['gene_biotype'] != 'protein_coding':
            continue
        span = int(info['end']) - int(info['start'])
        if span > best_span:
            best_id = trans_id
            best_span = span

    if best_id is None:
        logging.warning(
            "[Waring] do not found protein_coding transcript in {0}".format(gene_symbol))
    else:
        print_transcript(transcripts[best_id])
def print_transcript(trans):
    """Print one transcript as a fusion-file entry on stdout.

    Layout (header line, one line per exon, then a blank line):

        >EML4_ENST00000318522.5,chr2:42396490-42559688
        1,42396490,42396776
        2,42472645,42472827

    Args:
        trans: dict holding the parsed transcript record under
            'transcript_information' plus one parsed record per sub-feature
            (each with 'feature', 'start' and 'end').

    Only 'exon' records are listed, numbered from 1 in the order they were
    stored; CDS, UTR and codon records are skipped.
    """
    info = trans['transcript_information']
    attr = info['attr']

    # Avoid "chrchr1" when the annotation already uses chr-prefixed names.
    chrom = info['chr']
    if not chrom.startswith("chr"):
        chrom = "chr" + chrom

    out = [">{0}_{1}.{2},{3}:{4}-{5}".format(
        attr['gene_name'], attr['transcript_id'], attr['transcript_version'],
        chrom, info['start'], info['end'])]

    # Coordinates come from the stored records rather than from splitting the
    # dict key, so dots inside an identifier cannot shift the fields.
    exons = [
        record for key, record in trans.items()
        if key != 'transcript_information' and record['feature'] == 'exon'
    ]
    for number, record in enumerate(exons, start=1):
        out.append(",".join([str(number), str(record['start']), str(record['end'])]))

    print("\n".join(out))
    print("")
def parse_oneline(line):
    """Turn one tab-separated GTF record into a dict of its nine columns.

    Every column is whitespace-stripped and kept as a string; the ninth
    column is expanded into a dict by parse_attr and stored under 'attr'.

    Raises:
        ValueError: if the line does not split into exactly nine columns.
    """
    columns = [field.strip() for field in line.strip().split("\t")]
    (chrom, source, feature, start, end,
     score, strand, frame, raw_attributes) = columns
    attributes = parse_attr(raw_attributes)
    return {
        'chr': chrom,
        'source': source,
        'feature': feature,
        'start': start,
        'end': end,
        'score': score,
        'strand': strand,
        'frame': frame,
        'attr': attributes,
    }
def parse_attr(attr):
    """Parse a GTF attribute column into a dict of key -> value strings.

    Items are separated by ';' (with or without a following space); each item
    is a key, whitespace, then a value that may be wrapped in double quotes.
    Surrounding quotes are removed, values containing spaces are kept whole,
    and a ';' inside a quoted value does not end the item. Empty items are
    skipped, so an empty string yields an empty dict. A key without a value
    maps to ''. When a key repeats, the last occurrence wins.

    Args:
        attr: the raw ninth column of a GTF record.

    Returns:
        dict mapping attribute names to their unquoted string values.
    """
    parsed = {}
    pending = None
    for piece in attr.split(";"):
        if pending is not None:
            # Re-join a value that was cut at a ';' inside its quotes.
            piece = pending + ";" + piece
            pending = None
        if piece.count('"') % 2:
            # Odd number of quotes: the closing quote is in a later piece.
            pending = piece
            continue
        key, _, value = piece.strip().partition(" ")
        if key:
            parsed[key] = value.strip().strip('"')
    if pending is not None:
        # Unbalanced quote at the end of the column: keep what is there.
        key, _, value = pending.strip().partition(" ")
        if key:
            parsed[key] = value.strip().strip('"')
    return parsed
if __name__ == "__main__":
    # Command-line entry point. Running the script with no options keeps the
    # original hard-coded configuration, now exposed as option defaults.
    import argparse

    parser = argparse.ArgumentParser(
        description="Extract transcript/exon coordinates for selected genes "
                    "from a GTF file in fusion-file layout.")
    parser.add_argument(
        "--gtf", default="/Disk384G/biodatabase/Homo_sapiens.GRCh37.87.gtf",
        help="path to the GTF annotation file")
    parser.add_argument(
        "--genelist", default='./genelist.txt',
        help="text file with one gene name per line")
    parser.add_argument(
        "--mode", default="longest", choices=("longest", "all"),
        help="print the longest protein-coding transcript or all of them")
    args = parser.parse_args()

    # config
    mode = args.mode
    gtf = args.gtf
    genelist = args.genelist
    main(gtf, genelist, mode)
GTF格式很不方便,因爲每一行並不是獨立的,一個gene行下包含多個transcript行,每個transcript行下包含多個exon和CDS還有其他一堆亂七八糟的行。所以處理GTF文件我覺得最好是將文件解析成一個一個的gene_block。這個代碼對GTF文件實現了簡單的解析,按行讀取,根據gene -- transcript -- [exon/CDS....] 這種結構返回一個字典對象,達到了我的目的,其他細節沒太考慮。