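# In research work it is sometimes necessary to batch-crawl static (or dynamic) data predicted by a web server.
# This script implements that process in Python, using phosphorylation-site information as the example.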
import re
import urllib
import urllib.parse
import urllib.request

import numpy as np
from bs4 import BeautifulSoup
def _parse_prediction_rows(html):
    """Extract the (site, score) columns from the prediction result page.

    Args:
        html: Raw HTML (bytes or str) returned by the prediction server.

    Returns:
        A ``(sites, scores)`` tuple of equally long lists: 1-based integer
        site positions and their float scores.

    Raises:
        ValueError, IndexError: If a table row does not have the expected
            ``site, <something>, score`` layout.
    """
    soup = BeautifulSoup(html, features="html.parser")
    # Splitting on short tags (<tr>, <td>, </td>, ...) leaves only the cell
    # texts; compiled once because it is applied to every row.
    tag_splitter = re.compile('<.{2,5}>')
    sites = []
    scores = []
    # The first <tr> is skipped -- presumably the table header; verify
    # against the server's actual output if parsing ever fails.
    for row in soup.find_all("tr")[1:]:
        fields = [field for field in tag_splitter.split(str(row)) if field]
        sites.append(int(fields[0]))
        scores.append(float(fields[2]))
    return sites, scores


def _windowed_site_means(seq_len, sites, scores, half_window=4):
    """Spread each site score over nearby residues and average per residue.

    Every predicted site contributes its score to all residues within
    ``half_window`` positions of it (clamped to the sequence ends).  A
    residue's value is the mean of the non-zero contributions it received,
    rounded to 4 decimals, or the integer ``0`` when it received none.

    Args:
        seq_len: Number of residues in the sequence.
        sites: 1-based site positions.
        scores: Score of each site, parallel to ``sites``.
        half_window: Residues covered on each side of a site.

    Returns:
        A list of ``seq_len`` values (Python ``float`` or the int ``0``).

    Raises:
        IndexError: If a site lies outside ``1..seq_len``.
    """
    # Row = residue receiving a contribution, column = contributing site.
    # NOTE: this is a dense seq_len x seq_len float matrix, so memory grows
    # quadratically with the sequence length.
    spread = np.zeros((seq_len, seq_len))
    for site, score in zip(sites, scores):
        if not 1 <= site <= seq_len:
            # Without this check a site < 1 would silently wrap around to
            # the last columns via negative indexing.
            raise IndexError(
                "site %d is outside the sequence (length %d)" % (site, seq_len)
            )
        # Clamping both ends replaces the original three-way branch, which
        # mis-sized the assignment when a site was near both ends at once.
        first = max(0, site - half_window - 1)
        last = min(seq_len, site + half_window)
        spread[first:last, site - 1] = score

    means = []
    for row in spread:
        if row.sum() != 0:
            # float() keeps the later str() output free of NumPy's
            # "np.float64(...)" scalar repr (NumPy >= 2).
            means.append(float(round(np.mean(row[row != 0]), 4)))
        else:
            means.append(0)
    return means


def crawl_web_phosp(seq_all):
    """Query the DisPhos web server and return per-residue phosphorylation scores.

    The sequence is POSTed to the prediction endpoint, the (site, score) rows
    of the returned table are parsed, and each site's score is spread over
    the residues within 4 positions of it; each residue then gets the mean of
    the scores that reach it.

    Args:
        seq_all: Text submitted to the server.  Only its last line (after
            trailing whitespace is removed) is treated as the sequence when
            sizing the output.
            NOTE(review): a sequence wrapped over several lines would be
            under-counted here -- confirm the expected input format.

    Returns:
        A list of strings, one per residue, produced by splitting the list's
        string form on ``,``: the first element has no leading space, every
        later element starts with one (e.g. ``['0.9', ' 0', ' 0.45']``).
        Residues with no nearby predicted site yield ``'0'``.

    Raises:
        urllib.error.URLError: If the request fails.
        ValueError, IndexError: If the result table cannot be parsed or a
            reported site lies outside the sequence.
    """
    url = "http://www.dabi.temple.edu/disphos/pred/predict"
    # The original also built a dict of browser-like headers but never sent
    # it; it is dropped because its multipart Content-Type would contradict
    # the urlencoded body below.
    formdata = {
        "seq": seq_all,
        "seqfile": "(binary)",
        "org": "0",
        "submit": "Predict"
    }
    data = urllib.parse.urlencode(formdata).encode('utf-8')
    request = urllib.request.Request(url, data=data)
    # Context manager guarantees the connection is closed even on error.
    with urllib.request.urlopen(request) as response:
        html = response.read()

    sites, scores = _parse_prediction_rows(html)

    # rstrip() so a trailing newline does not leave an empty "last line".
    seq = seq_all.rstrip().split('\n')[-1]
    score_mean_set = _windowed_site_means(len(seq), sites, scores)

    score = str(score_mean_set).strip('[]').split(',')
    return score
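# Input: a protein amino-acid sequence fragment (25-5000 residues).
# Output: `score` holds the scores obtained after processing the raw predictions; each score corresponds to one residue's degree of phosphorylation.
# Original web server interface: http://www.dabi.temple.edu/disphos/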