python模擬登陸爬取webserver 輸出結果

#科研過程中,有時候需要批量爬取Webserver預測得到的靜態(動態)數據。
#以磷酸化位點信息爲例,用python實現其過程。

import re
import urllib
import urllib.parse
import urllib.request

import numpy as np
from bs4 import BeautifulSoup

def _fetch_prediction_html(seq_all):
    """POST the query to the disphos prediction form and return the reply body.

    Args:
        seq_all: Text submitted in the form's ``seq`` field.

    Returns:
        The raw response body as ``bytes``.
    """
    url = "http://www.dabi.temple.edu/disphos/pred/predict"
    formdata = {
        "seq": seq_all,
        "seqfile": "(binary)",
        "org": "0",
        "submit": "Predict",
    }
    # The body is form-urlencoded and is sent with urllib's default headers.
    # A browser-captured "multipart/form-data" Content-Type must not be
    # attached here: it would contradict the encoding of the body.
    data = urllib.parse.urlencode(formdata).encode('utf-8')
    request = urllib.request.Request(url, data=data)
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(request) as response:
        return response.read()


def _parse_predictions(html):
    """Read (site, score) from every ``<tr>`` of the reply except the first.

    Each row is split on short tags; the first non-empty piece is parsed as
    the integer site and the third as the float score.

    Returns:
        A ``(sites, scores)`` pair of equally long lists.

    Raises:
        ValueError, IndexError: If a row does not have the expected layout.
    """
    soup = BeautifulSoup(html, features="html.parser")
    tag_pattern = re.compile('<.{2,5}>')
    sites = []
    scores = []
    # The first row is skipped (presumably the table header -- confirm
    # against the live page).
    for row in soup.find_all("tr")[1:]:
        cells = [piece for piece in tag_pattern.split(str(row)) if piece]
        sites.append(int(cells[0]))
        scores.append(float(cells[2]))
    return sites, scores


def _window_mean_scores(seq_len, sites, scores, half_window=4):
    """Average, per residue, the scores of all sites whose window covers it.

    A site ``p`` (1-based) spreads its score over residues
    ``p - half_window .. p + half_window``, clipped to the sequence. Each
    residue receives the mean of the non-zero scores that reach it, rounded
    to 4 decimals, or the integer ``0`` when nothing reaches it.

    Args:
        seq_len: Number of residues in the sequence.
        sites: 1-based site positions.
        scores: Score of each site, parallel to ``sites``.
        half_window: Residues covered on each side of a site.

    Returns:
        A list of ``seq_len`` plain Python numbers.

    Raises:
        IndexError: If a site lies outside ``1..seq_len``.
    """
    # Running sum and count per residue: O(seq_len) memory instead of a
    # seq_len x seq_len matrix.
    totals = np.zeros(seq_len)
    counts = np.zeros(seq_len, dtype=int)
    # A repeated site keeps only its last score.
    for site, score in dict(zip(sites, scores)).items():
        if not 1 <= site <= seq_len:
            raise IndexError(
                f"site {site} is outside the sequence (length {seq_len})"
            )
        if score == 0:
            # Zero scores are excluded from the mean.
            continue
        start = max(0, site - half_window - 1)
        stop = min(seq_len, site + half_window)
        totals[start:stop] += score
        counts[start:stop] += 1
    # tolist() yields plain Python numbers, so the result does not depend on
    # how the installed NumPy prints its scalar types.
    return [
        round(total / count, 4) if total != 0 else 0
        for total, count in zip(totals.tolist(), counts.tolist())
    ]


def crawl_web_phosp(seq_all):
    """Query the disphos web server and return one smoothed score per residue.

    Args:
        seq_all: Query text sent to the server. Its last non-empty line is
            taken as the amino-acid sequence that the scores are mapped onto.

    Returns:
        A list of strings, one per residue, produced by splitting the
        ``str()`` of the score list on commas; every entry after the first
        therefore starts with a space. An empty sequence yields ``['']``.

    Raises:
        urllib.error.URLError: If the server cannot be reached.
        ValueError, IndexError: If the reply table cannot be parsed or names
            a site outside the sequence.
    """
    html = _fetch_prediction_html(seq_all)
    sites, scores = _parse_predictions(html)

    # Ignore surrounding whitespace so a trailing newline does not produce an
    # empty sequence.
    stripped = seq_all.strip()
    seq = stripped.splitlines()[-1].strip() if stripped else ""

    means = _window_mean_scores(len(seq), sites, scores)
    return str(means).strip('[]').split(',')

#輸入爲蛋白質氨基酸序列片段(25-5000)
#輸出score是對原始預測數據處理之後得到的得分,每一個得分對應每個殘基磷酸化程度

#原始webserver界面:

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章