[Data Analysis] An Analysis Based on News Text Data

Because of sensitive-vocabulary issues, every attempt to publish the write-up failed review, so the analysis itself was uploaded as images instead. The related code is in the appendix at the end of this post.

Data source: "2020 Memory: Reporting, Non-fiction and Personal Narratives" (continuously updated)


Appendix:

t1.py:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/2/12 13:02
# @Author  : ystraw
# @Site    : 
# @File    : t1.py
# @Software: PyCharm Community Edition
# @function: Collect the data links from the specified GitHub page.
#            After collecting the links, scrape the article content from each page according to its source site.

import requests
import time
import datetime
from bs4 import BeautifulSoup
from openpyxl import Workbook
import random
from lxml import etree
from openpyxl import load_workbook
import getIpPool
proxies = getIpPool.getproxies()
MAX_num = 15    # upper bound of the proxy pool index
openFlag = 1    # 0 = disable the IP proxy, 1 = enable it
outTime = 10    # request timeout in seconds

# Write text to a file (overwrite):
def writeFile(filename, file):
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(file)
    print(filename, 'written!')

# Write text to a file (append):
def writeFile_add(filename, file):
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(file)
    print(filename, 'written!')

# Read a file into a string
def readFile(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()
    print(filename, 'read!')
    return content
# Write a 2D list to an Excel file
def write_excel_xls(path, sheet_name, value, bHead):
    # number of rows to write
    index = len(value)
    wb = Workbook()
    # activate the worksheet
    ws = wb.active
    # write the header row
    ws.append(bHead)
    for i in range(2, index + 2):
        for j in range(1, len(value[i-2]) + 1):
            ws.cell(row=i, column=j, value=value[i-2][j-1])
    # save the workbook
    wb.save(path)
    print(path, 'written successfully!')

# Collect the article links from the GitHub page
def getUrl(path, url):
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]
    agent = random.choice(ua_list)
    headers = {
        "Connection": "keep-alive",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
        "User-Agent": agent,
    }
    print('Requesting:', url)
    html = requests.get(url, headers=headers, verify=False).text
    # xpath: extract the article titles
    text = etree.HTML(html)
    trs = text.xpath('//div[@class="Box-body"]//tbody/tr/td[2]/text()')
    # bs4: extract the rows of the link table
    bs = BeautifulSoup(html, 'lxml')
    div = bs.findAll('div', attrs={'class': 'Box-body'})[0]
    trList = div.findAll('tr')
    cnt = 0
    # all collected rows
    alldata = []
    for tr in trList:
        tds = tr.findAll('td')
        if tds != []:
            # extract: date, title
            tempList = [tds[0].string, trs[cnt]]
            # extract the links in the [original URL, screenshot, translation, Archive] columns
            for i in range(2, 6):
                astring = ''
                aList = tds[i].findAll('a')
                for a in aList:
                    astring += a['href'] + ','
                tempList.append(astring.strip(','))
            print(tempList)
            alldata.append(tempList)
            cnt += 1
    tableHead = ['日期', '標題', '原始URL', '截圖', '翻譯', 'Archive']
    write_excel_xls(path, 'link', alldata, tableHead)

# Extract a WeChat (mp.weixin.qq.com) article
def getdetailContent_1(url):
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]
    agent = random.choice(ua_list)
    headers = {
        "Connection": "keep-alive",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
        "User-Agent": agent,
    }
    print('Requesting:', url)
    global proxies
    global openFlag
    # pick a random proxy from the pool
    ip = proxies[random.randint(0, min(MAX_num, len(proxies) - 1))]
    if openFlag == 1:
        html = requests.get(url, timeout=outTime, headers=headers, proxies={ip[0]: ip[1]}, verify=False).text
    else:
        html = requests.get(url, timeout=outTime, headers=headers, verify=False).text
    text = etree.HTML(html)
    # note: the trailing space in the class name is deliberate and matches the WeChat article body div
    context = text.xpath('string(//div[@class="rich_media_content "])').replace(' ', '').replace('\n', '')
    return context

# Extract a Caixin (caixin.com) article
def getdetailContent_2(url):
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]
    agent = random.choice(ua_list)
    headers = {
        "Connection": "keep-alive",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
        "User-Agent": agent,
    }
    print('Requesting:', url)
    global proxies
    global openFlag
    ip = proxies[random.randint(0, min(MAX_num, len(proxies) - 1))]
    if openFlag == 1:
        html = requests.get(url, timeout=outTime, headers=headers, proxies={ip[0]: ip[1]}, verify=False).text
    else:
        html = requests.get(url, timeout=outTime, headers=headers, verify=False).text
    text = etree.HTML(html)
    context = text.xpath('string(//div[@id="Main_Content_Val"])')
    return context

# Extract an Economic Observer (eeo.com.cn) article
def getdetailContent_3(url):
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]
    agent = random.choice(ua_list)
    headers = {
        "Connection": "keep-alive",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
        "User-Agent": agent,
    }
    print('Requesting:', url)
    global proxies
    global openFlag
    ip = proxies[random.randint(0, min(MAX_num, len(proxies) - 1))]
    if openFlag == 1:
        # re-encode to iso-8859-1 to work around garbled characters on this site
        html = requests.get(url, timeout=outTime, headers=headers, proxies={ip[0]: ip[1]}, verify=False).text.encode('iso-8859-1')
    else:
        html = requests.get(url, timeout=outTime, headers=headers, verify=False).text.encode('iso-8859-1')
    text = etree.HTML(html)
    context = text.xpath('string(//div[@class="xx_boxsing"])')
    return context

# Extract an article from Fang Fang's blog (fangfang.blog.caixin.com)
def getdetailContent_4(url):
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]
    agent = random.choice(ua_list)
    headers = {
        "Connection": "keep-alive",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
        "User-Agent": agent,
    }
    print('Requesting:', url)
    global proxies
    global openFlag
    ip = proxies[random.randint(0, min(MAX_num, len(proxies) - 1))]
    if openFlag == 1:
        html = requests.get(url, timeout=outTime, headers=headers, proxies={ip[0]: ip[1]}, verify=False).text
    else:
        html = requests.get(url, timeout=outTime, headers=headers, verify=False).text
    text = etree.HTML(html)
    context = text.xpath('string(//div[@class="blog_content"])')
    return context

# Extract an article from the China Business Journal (cb.com.cn) special reports
def getdetailContent_5(url):
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]
    agent = random.choice(ua_list)
    headers = {
        "Connection": "keep-alive",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
        "User-Agent": agent,
    }
    print('Requesting:', url)
    global proxies
    global openFlag
    ip = proxies[random.randint(0, min(MAX_num, len(proxies) - 1))]
    if openFlag == 1:
        html = requests.get(url, timeout=outTime, headers=headers, proxies={ip[0]: ip[1]}, verify=False).text
    else:
        html = requests.get(url, timeout=outTime, headers=headers, verify=False).text
    text = etree.HTML(html)
    context = text.xpath('string(//div[@class="contentleft auto"])')
    return context

# Extract a Jiemian (jiemian.com) article
def getdetailContent_6(url):
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]
    agent = random.choice(ua_list)
    headers = {
        "Connection": "keep-alive",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
        "User-Agent": agent,
    }
    print('Requesting:', url)
    global proxies
    global openFlag
    ip = proxies[random.randint(0, min(MAX_num, len(proxies) - 1))]
    if openFlag == 1:
        html = requests.get(url, timeout=outTime, headers=headers, proxies={ip[0]: ip[1]}, verify=False).text
    else:
        html = requests.get(url, timeout=outTime, headers=headers, verify=False).text
    text = etree.HTML(html)
    context = text.xpath('string(//div[@class="article-content"])')
    return context

# Read the link list from Excel and fetch each article's content
def getContent(path, savePath):
    # load the workbook
    wb = load_workbook(path)
    sheet_names = wb.sheetnames
    table = wb[sheet_names[0]]  # the first sheet
    nrows = table.max_row     # number of rows
    ncols = table.max_column  # number of columns
    print(nrows, ncols)
    cnt = 0
    alldata = []
    for i in range(2, nrows + 1):
        templist = []
        for j in range(1, ncols + 1):
            templist.append(table.cell(i, j).value)
        # the detail link is the first URL in the "原始URL" column
        url = table.cell(i, 3).value.split(',')[0]
        try:
            if url[:24] == 'https://mp.weixin.qq.com':
                # WeChat official account article
                content = getdetailContent_1(url)
                templist.append('微信公共號')
                templist.append(content)
            elif url[:24] == 'http://china.caixin.com/' or url[:22] == 'http://www.caixin.com/' or url[:25] == 'http://weekly.caixin.com/':
                # Caixin article
                content = getdetailContent_2(url)
                templist.append('財新網')
                templist.append(content)
            elif url[:22] == 'http://www.eeo.com.cn/':
                # Economic Observer article
                content = getdetailContent_3(url)
                templist.append('經濟觀察網')
                templist.append(content)
            elif url[:32] == 'http://fangfang.blog.caixin.com/':
                # Fang Fang's blog
                content = getdetailContent_4(url)
                templist.append('方方博客')
                templist.append(content)
            elif url[:21] == 'http://www.cb.com.cn/':
                # China Business Journal special report
                content = getdetailContent_5(url)
                templist.append('中國經營網')
                templist.append(content)
            elif url[:24] == 'https://www.jiemian.com/':
                # Jiemian article
                content = getdetailContent_6(url)
                templist.append('界面網')
                templist.append(content)
            else:
                # count articles from sources that are not handled above
                cnt += 1
            alldata.append(templist)
        except Exception as ex:
            print('Exception:', ex)
        # time.sleep(random.randint(0, 2))
    print('Articles from unhandled sources:', cnt)
    tableHead = ['日期', '標題', '原始URL', '截圖', '翻譯', 'Archive','文章來源', '文章內容']
    write_excel_xls(savePath, 'link', alldata, tableHead)

if __name__ == '__main__':
    '''
     Step 1: collect the article links from GitHub
    '''
    # source page:
    # url = 'https://github.com/2019ncovmemory/nCovMemory#%E7%AC%AC%E4%B8%80%E8%B4%A2%E7%BB%8Fyimagazine'
    # # output file path:
    # path = './data/all_text_2.xlsx'
    # getUrl(path, url)
    '''
     Step 2: fetch the article content for each collected link
    '''
    # input file with the collected links:
    path = './data/all_text_link_2.xlsx'
    # output path:
    savePath = './data/text_0.xlsx'
    getContent(path, savePath)
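
getIpPool.py is imported by t1.py but is not part of this appendix. From the way its return value is used above (proxies={ip[0]: ip[1]}), getproxies() is assumed to return a list of (scheme, proxy_url) pairs; a minimal stand-in under that assumption could look like this (the addresses are placeholders, not a working pool):

# getIpPool.py -- minimal stand-in, assumed interface only
def getproxies():
    # each entry is (scheme, proxy_url), matching proxies={ip[0]: ip[1]} in t1.py
    return [
        ('http', 'http://127.0.0.1:8080'),
        ('https', 'http://127.0.0.1:8080'),
    ]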

t2.py:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/2/13 13:46
# @Author  : ystraw
# @Site    : 
# @File    : t2.py
# @Software: PyCharm Community Edition
# @function: clean up the alltext.xlsx produced by t1
# 1. drop empty rows

import numpy
import pandas as pd
import jieba

# Read a file into a string
def readFile(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()
    print(filename, 'read!')
    return content

# Drop rows whose article content is empty
def dealNull(path, savepath):
    data = pd.read_excel(path, sheet_name=0)
    df = pd.DataFrame(data)
    print(data.shape)        # data size
    print(data.columns)      # column index
    # basic information about the table (dimensions, column names, dtypes, memory usage, ...)
    print(data.info())
    # dtype of every column
    print('dtypes:\n', df.dtypes)
    # flag rows whose article content is not null
    data_notnull = data['文章內容'].notnull()
    # keep only the non-empty rows
    data_new = data[data_notnull]
    print('Shape after dropping empty rows:\n', data_new.shape)
    # save the result
    data_new.to_excel(savepath, index=False, header=True)

# Tokenize the text and count word frequencies
def fenci(content):
    # read the stop word file:
    sword = readFile('./data/stopword.txt')
    # build the stop word list (one word per line):
    sword = sword.split('\n')
    worddict = {}
    wordlist = []
    for w in jieba.cut(content, cut_all=False):  # cut_all=False is precise mode, True is full mode
        # keep tokens of length >= 2 that are not stop words or whitespace
        if (w not in sword) and w != '' and w != ' ' and w is not None and w != '\n' and len(w) >= 2:
            wordlist.append(w)
            worddict[w] = worddict.get(w, 0) + 1
    return [worddict, wordlist]

# Preprocess the data: tokenize every article
def preDeal(path, savepath):
    # read the data
    data = pd.read_excel(path, sheet_name=0)
    df = pd.DataFrame(data)
    # add a column for the tokenized content
    df['文章內容分詞'] = None
    for i in range(df.shape[0]):
        # tokenize the article
        rt = fenci(df['文章內容'].at[i])
        df['文章內容分詞'].at[i] = ' '.join(rt[1])
    # save the result
    df.to_excel(savepath, index=False, header=True)

if __name__ == '__main__':
    '''
    Data cleaning
    '''
    # # drop empty rows
    # path = './data/text_0.xlsx'
    # savepath = './data/text_1.xlsx'
    # dealNull(path, savepath)

    '''
    Data preprocessing: tokenize the article content
    '''
    path = './data/text_1.xlsx'
    savepath = './data/text_2.xlsx'
    preDeal(path, savepath)
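
fenci() assumes ./data/stopword.txt lists one stop word per line (the file itself is not included in this appendix). A quick interactive check of its return value [worddict, wordlist], using a made-up sentence, might look like:

# usage sketch; run in the same session as t2.py
rt = fenci('今天的新聞報道了武漢的醫院和醫生')
print(rt[1])   # list of tokens (length >= 2) that survived the stop word filter
print(rt[0])   # dict mapping each token to its frequency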

t3.py:

# Import Geo; note that the 1.x import path differs from the 0.x versions
# upgrade with: pip install --upgrade pyecharts
from pyecharts.charts import Geo
# import the configuration options
from pyecharts import options as opts
# ChartType: chart type, SymbolType: marker type
from pyecharts.globals import ChartType, SymbolType

# Read a file into a string
def readFile(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()
    print(filename, 'read!')
    return content

geo = Geo()
# register the coordinate of every city: name, longitude, latitude
# read the city coordinate file (one "city,lat,lon" record per line):
zb_city = readFile('./data/1-5LineCity_2.txt')
cityList = zb_city.split('\n')
for cy in cityList:
    if cy == '' or cy is None:
        continue
    temp = cy.split(',')
    geo.add_coordinate(name=temp[0], longitude=temp[2], latitude=temp[1])

# map type; use "world" for a world map
geo.add_schema(maptype="china")
# node weights (mention counts) from city_node.csv ("Id Label Weight", space separated):
cityList = readFile('./data/city_node.csv').split('\n')
data = []
for i in range(len(cityList)):
    city = cityList[i]
    if i == 0 or city == '' or city is None:
        continue
    data.append((city.split(' ')[0], int(city.split(' ')[2])))
# co-occurrence edges from city_edge.csv ("Source Target Weight", space separated):
cityList = readFile('./data/city_edge.csv').split('\n')
data2 = []
for i in range(len(cityList)):
    city = cityList[i]
    if i == 0 or city == '' or city is None:
        continue
    # skip pairs with few co-occurrences:
    if int(city.split(' ')[2]) < 200:
        continue
    data2.append((city.split(' ')[0], city.split(' ')[1]))
# print(data2)
# add the scatter layer of city weights
geo.add("", data, type_=ChartType.EFFECT_SCATTER)
# add the flow layer: type_ is LINES and the ripple effect is drawn as arrows; available
# symbol types include 'circle', 'rect', 'roundRect', 'triangle', 'diamond', 'pin', 'arrow', 'none'
geo.add("geo-lines",
        data2,
        type_=ChartType.LINES,
        effect_opts=opts.EffectOpts(symbol=SymbolType.ARROW, symbol_size=10, color="yellow"),
        linestyle_opts=opts.LineStyleOpts(curve=0.2),
        is_large=True)
# label display for the series (is_show=False would hide the city labels)
geo.set_series_opts(label_opts=opts.LabelOpts(is_show=True))
# chart title; visualmap_opts=opts.VisualMapOpts() is the visual-map legend in the lower left
geo.set_global_opts(visualmap_opts=opts.VisualMapOpts(), title_opts=opts.TitleOpts(title="城市動態流向圖"))
# render directly inside a notebook
geo.render_notebook()
# or write an html file (a path can be passed in)
geo.render("城市動態流向圖.html")
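
For reference, the input formats this script expects, as implied by the parsing code above and by dataAnalysis.py below:

# ./data/1-5LineCity_2.txt : one "city,lat,lon" record per line, e.g. (illustrative values)
#   北京市,39.90,116.41
# ./data/city_node.csv : produced by dataAnalysis.py, space separated, header "Id Label Weight"
# ./data/city_edge.csv : produced by dataAnalysis.py, space separated, header "Source Target Weight"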

dataAnalysis.py:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/2/17 18:42
# @Author  : ystraw
# @Site    : 
# @File    : dataAnalysis.py
# @Software: PyCharm Community Edition
# @function: run the data analysis
import folium
import codecs
from folium.plugins import HeatMap
from pyecharts.charts import Geo
from pyecharts.charts import Map
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read a file into a string
def readFile(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()
    print(filename, 'read!')
    return content

# Descriptive analysis
def ms_analysis(filepath):
    # make Chinese characters and the minus sign display correctly in matplotlib
    plt.rcParams['font.sans-serif'] = ['KaiTi']
    plt.rcParams['font.serif'] = ['KaiTi']
    # read the data
    data = pd.read_excel(filepath)
    '''
    Number of articles by source and publication date
    '''
    # plot the daily number of published articles, broken down by source:
    # data.groupby('日期')['文章來源'].value_counts().unstack().fillna(value=0).plot(kind='bar', title='文章發佈量分日統計')
    # plt.show()
    # return

    '''
    City mention analysis
    '''
    # read the city list and build a city -> count dictionary:
    city = readFile('./data/1-5LineCity_2.txt')
    cityList = city.split('\n')
    cityDict = {}
    for cy in cityList:
        if cy == '' or cy is None:
            continue
        temp = cy.split(',')
        # drop the last character of the city name (the trailing 市) so it matches the tokenized text
        cityDict[temp[0][:-1]] = 0
    print(data.shape[0], data.shape[1])
    # count how often each city is mentioned in the tokenized articles
    for i in range(data.shape[0]):
        wordList = data['文章內容分詞'].at[i].split(' ')
        for word in wordList:
            if word in cityDict:
                cityDict[word] += 1
    print(cityDict)

    # draw the heat map:
    # build [lat, lon, weight] triples for folium's HeatMap
    data = []
    for cy in cityList:
        if cy == '' or cy is None:
            continue
        temp = cy.split(',')
        data.append([float(temp[1]), float(temp[2]), cityDict[temp[0][:-1]]])
    print(data)
    map_osm = folium.Map([33., 113.], zoom_start=12)   # base map centred on China; zoom_start sets the initial zoom level
    HeatMap(data).add_to(map_osm)                      # add the heat layer to the map
    map_osm.save('./image/文章提及城市分佈.html')       # save the rendered map as an html file

# Build the city co-occurrence node/edge files
def city_gx_analysis(filepath):
    citys = {}            # city -> mention count
    relationships = {}    # city -> {co-occurring city -> count}
    lineCitys = []        # the cities mentioned in each article

    # build the set of known cities:
    cityList = readFile('./data/1-5LineCity.txt').split('\n')
    citySet = set()
    for city in cityList:
        citySet.add(city.replace('市', ''))

    # read the tokenized articles
    data = pd.read_excel(filepath)
    # collect the city mentions per article
    for i in range(data.shape[0]):
        wordList = data['文章內容分詞'].at[i].split(' ')
        lineCitys.append([])
        for word in wordList:
            if word not in citySet:
                continue
            lineCitys[-1].append(word)
            if citys.get(word) is None:
                citys[word] = 0
                relationships[word] = {}
            # increment the mention count
            citys[word] += 1
    # build the co-occurrence relationships
    for line in lineCitys:                   # for each article
        for city1 in line:
            for city2 in line:               # every ordered pair of cities in the article
                if city1 == city2:
                    continue
                if relationships[city1].get(city2) is None:   # first co-occurrence of this pair
                    relationships[city1][city2] = 1
                else:
                    relationships[city1][city2] = relationships[city1][city2] + 1   # one more co-occurrence
    # write the node and edge files (Id/Label/Weight and Source/Target/Weight, space separated)
    with codecs.open("./data/city_node.csv", "w", "utf-8") as f:
        f.write("Id Label Weight\r\n")
        for city, times in citys.items():
            f.write(city + " " + city + " " + str(times) + "\r\n")

    with codecs.open("./data/city_edge.csv", "w", "utf-8") as f:
        f.write("Source Target Weight\r\n")
        for city, edges in relationships.items():
            for v, w in edges.items():
                if w > 3:
                    f.write(city + " " + v + " " + str(w) + "\r\n")

if __name__ == '__main__':
    filepath = './data/text_2.xlsx'
    # descriptive analysis
    # ms_analysis(filepath)

    # co-occurrence analysis between cities
    city_gx_analysis(filepath)
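
Before feeding the generated files to t3.py, they can be sanity-checked with pandas; this sketch only assumes the space-separated headers written above:

# quick check of the node/edge files written by city_gx_analysis
import pandas as pd
nodes = pd.read_csv('./data/city_node.csv', sep=' ')
edges = pd.read_csv('./data/city_edge.csv', sep=' ')
print(nodes.sort_values('Weight', ascending=False).head(10))   # most mentioned cities
print(edges.sort_values('Weight', ascending=False).head(10))   # strongest co-occurrence pairs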

TF-IDF.py:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/2/23 22:20
# @Author  : ystraw
# @Site    : 
# @File    : TF-IDF.py
# @Software: PyCharm Community Edition
# @function: extract keywords from the article content

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from openpyxl import Workbook

# Write a 2D list to an Excel file
def write_excel_xls(path, sheet_name, value, bHead):
    # number of rows to write
    index = len(value)
    wb = Workbook()
    # activate the worksheet
    ws = wb.active
    # write the header row
    ws.append(bHead)
    for i in range(2, index + 2):
        for j in range(1, len(value[i-2]) + 1):
            ws.cell(row=i, column=j, value=value[i-2][j-1])
    # save the workbook
    wb.save(path)
    print(path, 'written successfully!')

def TQ():
    # read the tokenized articles
    filepath = './data/text_2.xlsx'
    data = pd.read_excel(filepath)
    document = list(data['文章內容分詞'])

    # min_df: when building the vocabulary, ignore terms whose document frequency is strictly
    # below the given threshold. A float is treated as a proportion of documents, an integer
    # as an absolute count; the parameter is ignored if a vocabulary is supplied explicitly.
    tfidf_model = TfidfVectorizer(min_df=0.023).fit(document)
    # the vocabulary: every distinct term kept from the corpus
    feature = tfidf_model.get_feature_names()
    # tfidf_model.vocabulary_ maps each term to its column index in the matrix below

    # tf-idf matrix: one row per document, one column per term
    sparse_result = tfidf_model.transform(document)

    # dense array of tf-idf weights: each row is a document, each column the weight of a term
    # (0 if the document does not contain that term)
    weight = sparse_result.toarray()

    # for every term keep its maximum tf-idf weight over all documents:
    feature_TFIDF = {}
    for i in range(len(weight)):
        for j in range(len(feature)):
            if feature[j] not in feature_TFIDF:
                feature_TFIDF[feature[j]] = weight[i][j]
            else:
                feature_TFIDF[feature[j]] = max(feature_TFIDF[feature[j]], weight[i][j])
    # sort by weight and keep up to the top 600 terms:
    print('Top terms by TF-IDF:')
    alldata = []
    featureList = sorted(feature_TFIDF.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    # start at index 0 so the top-ranked term is included
    for i in range(0, 600 if len(featureList) > 600 else len(featureList)):
        print(featureList[i][0], featureList[i][1])
        alldata.append([featureList[i][0], featureList[i][1]])
    # write the keyword list to an Excel file:
    tableHead = ['關鍵詞', 'TF-IDF']
    import datetime
    filetime = str(datetime.datetime.now()).replace('-', '').replace(' ', '_').replace(':', '_')[:17]
    write_excel_xls('./data/關鍵詞_' + filetime + '.xlsx', 'link', alldata, tableHead)

def drawWordCloud():
    from wordcloud import WordCloud
    # scipy.misc.imread was removed from recent SciPy releases; imageio.imread is a drop-in replacement here
    from imageio import imread
    # read the tokenized articles
    filepath = './data/text_2.xlsx'
    data = pd.read_excel(filepath)
    document = list(data['文章內容分詞'])
    # join the documents into one whitespace-separated string (WordCloud splits on whitespace):
    words = ' '.join(document)
    # mask image that defines the shape of the cloud:
    b_mask = imread('./image/ciyun.webp')
    # draw the word cloud:
    wc = WordCloud(
        background_color="white",        # background colour
        max_words=2000,                  # maximum number of words displayed
        font_path="./image/simkai.ttf",  # font able to render Chinese characters
        # min_font_size=5,
        # max_font_size=80,
        # width=400,                     # image width
        mask=b_mask
    )
    wc.generate(words)
    # write the rendered image:
    wc.to_file("./image/beijing_2.jpg")

if __name__ == '__main__':
    '''
    Keyword extraction
    '''
    # TQ()

    '''
    Draw the word cloud
    '''
    drawWordCloud()
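
min_df is the only TfidfVectorizer option used in TQ() above; a tiny self-contained illustration of how it prunes the vocabulary (toy corpus, not the project data; the default tokenizer keeps tokens of two or more characters):

from sklearn.feature_extraction.text import TfidfVectorizer
docs = ['aa bb cc', 'aa bb', 'aa dd']
print(sorted(TfidfVectorizer().fit(docs).vocabulary_))            # ['aa', 'bb', 'cc', 'dd']
print(sorted(TfidfVectorizer(min_df=2).fit(docs).vocabulary_))    # ['aa', 'bb']  (in at least 2 documents)
print(sorted(TfidfVectorizer(min_df=0.5).fit(docs).vocabulary_))  # ['aa', 'bb']  (in at least half of the documents)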

LDA_主題模型.py:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/2/26 14:10
# @Author  : ystraw
# @Site    : 
# @File    : LDA_主題模型.py
# @Software: PyCharm Community Edition
# @function: topic extraction with an LDA topic model
import pandas as pd
import numpy as np

def LDA():
    # read the tokenized articles
    filepath = './data/text_2.xlsx'
    data = pd.read_excel(filepath)
    document = list(data['文章內容分詞'])

    # build the document-term count matrix:
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation
    corpus = document
    cntVector = CountVectorizer()
    cntTf = cntVector.fit_transform(corpus)
    # the selected vocabulary
    vocs = cntVector.get_feature_names()
    print('Vocabulary size:', len(vocs))
    # cntTf holds the term counts of every document

    # LDA topic model
    lda = LatentDirichletAllocation(n_components=4,      # number of topics
                                    max_iter=5,          # maximum number of EM iterations
                                    learning_method='online',
                                    learning_offset=20., # only used by the online method, must be > 1;
                                                         # it down-weights the early training batches
                                    random_state=0)
    docres = lda.fit_transform(cntTf)
    # document-topic probability matrix
    LDA_corpus = np.array(docres)
    print('Document-topic probabilities:\n', LDA_corpus)
    # lda.components_ holds the topic-term weights, one row per topic
    # assign every article to the topic with the highest probability:
    arr = pd.DataFrame(LDA_corpus)
    data['主題類別'] = np.argmax(LDA_corpus, axis=1)  # column index of the row maximum
    data['主題出現概率'] = arr.max(axis=1)            # value of the row maximum
    print('Assigned topics:\n', data.head())
    data.to_excel('./data/LDA_主題分佈_類別.xlsx', index=False)
    # return

    # print the weight of each term within each topic
    tt_matrix = lda.components_
    # topic id
    id = 0
    # rows to store
    datalist = []
    for tt_m in tt_matrix:
        # pair every term with its weight in this topic
        tt_dict = [(name, tt) for name, tt in zip(vocs, tt_m)]
        tt_dict = sorted(tt_dict, key=lambda x: x[1], reverse=True)
        # alternatively keep only terms with a weight above 0.6:
        # tt_dict = [tt_threshold for tt_threshold in tt_dict if tt_threshold[1] > 0.6]
        # keep the top 20 terms of every topic:
        tt_dict = tt_dict[:20]
        print('Topic %d:' % id, tt_dict)
        # store the rows:
        datalist += [[tt_dict[i][0], tt_dict[i][1], id] for i in range(len(tt_dict))]
        id += 1
    # write to excel:
    # df = pd.DataFrame(datalist, columns=['特徵詞', '權重', '類別'])
    # df.to_excel('./data/LDA_主題分佈3.xlsx', index=False)

if __name__ == '__main__':
    '''
        Topic extraction with the LDA topic model:
    '''
    LDA()
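
lda.components_ used above holds unnormalised topic-term weights rather than probabilities; if per-topic word probabilities are wanted, the rows can be normalised inside LDA() after the loop above (a small optional addition, not part of the original script):

    # convert each topic's weights into a probability distribution over the vocabulary
    topic_word = lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]
    for k, row in enumerate(topic_word):
        top = row.argsort()[::-1][:10]   # indices of the 10 most probable terms
        print('Topic %d (probabilities):' % k, [(vocs[i], round(row[i], 4)) for i in top])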

Snownlp情感分析.py:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/2/27 12:54
# @Author  : ystraw
# @Site    : 
# @File    : Snownlp情感分析.py
# @Software: PyCharm Community Edition
# @function: sentiment analysis of the article titles with SnowNLP

import pandas as pd
from snownlp import SnowNLP

def qgjs():
    # read the data labelled with topics
    data = pd.read_excel('./data/LDA_主題分佈_類別.xlsx')
    # score the sentiment of every article title
    score = []
    for i in range(0, data.shape[0]):
        s = SnowNLP(data['標題'].at[i])
        score.append(s.sentiments)
    data['情緒得分'] = score
    print(data.head())
    data.to_excel('./data/情緒得分.xlsx', index=False)

if __name__ == '__main__':
    qgjs()
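
SnowNLP's sentiments score is a probability between 0 and 1, with values near 1 read as positive and values near 0 as negative; a one-line check with a made-up title:

# usage sketch
from snownlp import SnowNLP
print(SnowNLP('疫情防控取得積極進展').sentiments)   # a float in [0, 1]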