Python Crawler Part 1: Scraping Chinese and English Academic Paper Data

Screenshot of the program running:

MySQL code:

CREATE TABLE `article` (
  `id` int(11) NOT NULL,
  `article_time` varchar(50) DEFAULT NULL,
  `article_volume` varchar(20) DEFAULT NULL,
  `article_author` varchar(2000) DEFAULT NULL,
  `article_name_english` varchar(2000) DEFAULT NULL,
  `article_name_chinese` varchar(2000) DEFAULT NULL,
  `article_content_english` varchar(5000) DEFAULT NULL,
  `article_content_chinese` varchar(2000) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8

Python code:

import random
import re
import requests
import pymysql

# Open the database connection (adjust host, port, and credentials to your own MySQL setup)
db = pymysql.connect(host='localhost',
                     port=8080,
                     user='root',
                     passwd='123',
                     db='students',
                     charset='utf8')
# Create a cursor object with the cursor() method
cursor = db.cursor()


# Purpose: for each year, fetch the paper title (English and Chinese), the authors, the abstract (English and Chinese), and the date

# Translation helper; `content` must be an English string
def translator_chinese(content):
    """Translate an English string into Chinese via the iciba API."""
    translator_url = "http://fy.iciba.com/ajax.php?a=fy&f=en&t=zh-CHS&w=%s" % ('"' + content + '"')
    urls = re.findall(r'"out":"(.*?)","ci', requests.get(translator_url).text, re.S)
    if len(urls) > 0:
        result = (urls[0].encode('ascii').decode('unicode_escape')).replace("“", "").replace("”", "")
        return result
    else:
        return ""


# For testing:
# print(translator_chinese(" therefore, be treated as a unity of contradictions."))
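
# A hypothetical alternative (not used below): if the iciba endpoint returns JSON, as the
# '"out":"...","ci' pattern above suggests, the translated text can be read with the json
# parser instead of a regex; json decoding also resolves the \uXXXX escapes, so the
# unicode_escape step would not be needed. This is only a sketch, and the payload shape is
# an assumption that has not been verified against the live service.
def translator_chinese_json(content):
    """Sketch: parse the translation response as JSON (payload shape is assumed)."""
    url = "http://fy.iciba.com/ajax.php?a=fy&f=en&t=zh-CHS&w=%s" % ('"' + content + '"')
    try:
        data = requests.get(url, timeout=10).json()
    except ValueError:
        # Not valid JSON; fall back to an empty string, like translator_chinese does
        return ""
    payload = data.get("content", data) if isinstance(data, dict) else {}
    out = payload.get("out", "") if isinstance(payload, dict) else ""
    return out.replace("“", "").replace("”", "")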

# Fetch the data for one year
def get_data(year):
    """Fetch the issues and articles published in the given year."""
    JZ_URL = "https://journals.sagepub.com/loi/oss?year=%i" % year
    response = requests.get(JZ_URL)
    print("*" * 300)
    print("Starting to crawl the literature data for %s!" % year)
    # Extract the volume number
    jz = (re.findall(r'class="expander".*?data-attr-vol="(.*?)"', response.text, re.S))[1]
    print("Volume: " + jz)
    # Extract the issue URLs
    article_ml = re.findall(r'class="row js_issue".*?href="(.*?)"', response.text, re.S)
    print("Issue table-of-contents URLs:")
    for i in range(0, len(article_ml)):
        print(str(i + 1) + "." + article_ml[i])
    print("*" * 300)
    for temp in article_ml:
        data = requests.get(temp)
        article_time = re.findall(r'<div class="journalNavTitle">\n(.*?)\n</div>', data.text, re.S)
        # Extract the issue date
        time = article_time[0][article_time[0].index(",") + 1:len(article_time[0])]
        print("Issue date: " + time)
        # Extract the article page URLs
        addr = re.findall(r'class="ref nowrap" href="(.*?)"', data.text, re.S)
        Basic_URL = "https://journals.sagepub.com"
        print("Article page URLs:")
        for lb in range(0, len(addr)):
            print(str(lb + 1) + "." + addr[lb])
        for ad in addr:
            # Fetch each article's page
            print("*" * 300)
            article_data = requests.get(Basic_URL + ad)
            article_c = re.findall(r'property="og:title" content="(.*?)"', article_data.text, re.S)
            if len(article_c) > 0:
                if "-" in article_c[0]:
                    # Extract the authors
                    article_author = article_c[0][article_c[0].index("-") + 1:len(article_c[0])]
                    # Extract the English title
                    article_name_english = article_c[0][0:article_c[0].index("-")]
                    article_name_chinese = translator_chinese(article_name_english)
                    print("English title: " + article_name_english)
                    print("Chinese title: " + article_name_chinese)
                    print("Authors: " + article_author)
                else:
                    article_author = ""
                    article_name_english = article_c[0]
                    article_name_chinese = translator_chinese(article_name_english)
                    print("English title: " + article_name_english)
                    print("Chinese title: " + article_name_chinese)
                    print("Authors: " + article_author)
            else:
                break
            # Extract the abstract
            article_content_data = re.findall(r'<div class="abstractSection abstractInFull"><p>(.*?)</p>',
                                              article_data.text, re.S)
            if len(article_content_data) > 0:
                article_content_english = article_content_data[0]
                article_content_chinese = translator_chinese(article_content_data[0])
                print("英文摘要:" + article_content_english)  # 英文摘要
                print("中文摘要:" + article_content_chinese)  # 中文摘要
            else:
                article_content_english = ""
                article_content_chinese = ""  # 中英文摘要都爲空
                print("英文摘要:" + article_content_english)  # 英文摘要
                print("中文摘要:" + article_content_chinese)  # 中文摘要

            # Write the record to the database with a parameterised query,
            # letting the driver handle escaping instead of pymysql.escape_string
            id = random.randint(0, 999999999)
            sql = """insert into article(id, article_time, article_volume, article_author,
                     article_name_english, article_name_chinese,
                     article_content_english, article_content_chinese)
                     values(%s, %s, %s, %s, %s, %s, %s, %s)"""
            cursor.execute(sql, (id, time, jz, article_author, article_name_english,
                                 article_name_chinese, article_content_english, article_content_chinese))
            print("Record id %i scraped and saved!" % id)
            # Commit the transaction
            db.commit()


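# A hypothetical helper (not called anywhere below): the same issue-link extraction done
# with BeautifulSoup instead of regular expressions, which tends to survive small markup
# changes better. It assumes `pip install beautifulsoup4` and that the "row js_issue"
# class names taken from the regex above still match the live page; neither is verified here.
def get_issue_links(year):
    """Sketch: collect the issue URLs for one year using BeautifulSoup."""
    from bs4 import BeautifulSoup  # local import so the rest of the script runs without bs4
    url = "https://journals.sagepub.com/loi/oss?year=%i" % year
    soup = BeautifulSoup(requests.get(url, timeout=10).text, "html.parser")
    links = []
    for issue in soup.select(".row.js_issue"):
        link = issue if issue.has_attr("href") else issue.find("a", href=True)
        if link is not None:
            links.append(link["href"])
    return links
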
# Main entry point
if __name__ == '__main__':
    for year in range(2015, 2017):
        get_data(year)
    print("Finished crawling the data!")
    db.close()
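
One caveat with the script above: each row gets a random id from random.randint, so two articles can occasionally collide on the primary key. A minimal sketch of an alternative is shown below; it assumes the id column is redefined as AUTO_INCREMENT (which the schema at the top does not do), so MySQL assigns the key itself. The helper name insert_article_auto_id is made up for illustration.

def insert_article_auto_id(cursor, db, row):
    """Sketch only: requires the `id` column to be AUTO_INCREMENT, unlike the table above.
    `row` is a 7-tuple matching the column order in the INSERT statement."""
    sql = """insert into article(article_time, article_volume, article_author,
             article_name_english, article_name_chinese,
             article_content_english, article_content_chinese)
             values(%s, %s, %s, %s, %s, %s, %s)"""
    cursor.execute(sql, row)
    db.commit()
    return cursor.lastrowid  # the id MySQL generated for this row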

The program may still contain a few bugs; feedback and corrections are welcome.
