Crawler Learning Notes - Selected Site Records (1)

Twitter
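
Two Python 2 modules follow: TwitterService fetches followers, followings, timelines, and profiles from the v1.1 REST API and caches the raw JSON under static/; TwitterRlTimeService pages through followers/followings in real time and exports them to .xlsx; both share a ClientService that rotates OAuth clients to ride out rate limits.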

# -*- coding:utf-8 -*-

import os
import sys
import json
import time
import datetime
import xlsxwriter

from utils.twitter_client import ClientService

# Python 2 idiom: force utf-8 as the default str/unicode encoding.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

client_service = ClientService()


class TwitterService(object):

    # Followers
    @staticmethod
    def read_followers_by_screen_name(screen_name, cursor='-1', count=200):
        top_parent_dir = os.path.join(os.path.dirname(__file__), 'static')
        parent_dir = os.path.join(top_parent_dir, screen_name, 'followers')
        if not os.path.exists(parent_dir):
            os.makedirs(parent_dir)

        # If a snapshot is already cached on disk, return the newest one
        # (files are named <screen_name>_<cursor>_<epoch>, so sorting on the
        # last '_' field orders them by timestamp) instead of calling the API.
        dest_file_list = os.listdir(parent_dir)
        if dest_file_list:
            dest_file = sorted(dest_file_list, key=lambda k: k.split('_')[-1], reverse=True)[0]
            with open(os.path.join(parent_dir, dest_file), 'r') as rf:
                return rf.readline()

        url = 'https://api.twitter.com/1.1/followers/list.json?screen_name=$screen_name&cursor=$cursor&count=$count&skip_status=true&include_user_entities=false'
        url = url.replace('$screen_name', screen_name).replace('$cursor', str(cursor)).replace('$count', str(count))
        client = client_service.get_client()
        try:
            resp, content = client.request(url, method='GET', body='', headers=None)
        except Exception as exp:
            print exp
            return None

        result = json.loads(content)
        if 'errors' in result:
            # API error (usually rate limiting): park this client for 15
            # minutes and retry once with another client from the pool.
            client_service.remove_client(client)
            client = client_service.get_client()
            resp, content = client.request(url, method='GET', body='', headers=None)

        dest_file = os.path.join(parent_dir, TwitterService.append_timestamp_suffix(screen_name + '_' + cursor))
        with open(dest_file, 'w') as wf:
            wf.write(str(content) + '\n')

        return content

    # Followings (accounts this user follows)
    @staticmethod
    def read_followings_by_screen_name(screen_name, cursor='-1', count=200):
        top_parent_dir = os.path.join(os.path.dirname(__file__), 'static')
        parent_dir = os.path.join(top_parent_dir, screen_name, 'followings')
        if not os.path.exists(parent_dir):
            os.makedirs(parent_dir)

        # Return the newest cached snapshot if one exists.
        dest_file_list = os.listdir(parent_dir)
        if dest_file_list:
            dest_file = sorted(dest_file_list, key=lambda k: k.split('_')[-1], reverse=True)[0]
            with open(os.path.join(parent_dir, dest_file), 'r') as rf:
                return rf.readline()

        url = 'https://api.twitter.com/1.1/friends/list.json?screen_name=$screen_name&cursor=$cursor&count=$count&skip_status=true&include_user_entities=false'
        url = url.replace('$screen_name', screen_name).replace('$cursor', str(cursor)).replace('$count', str(count))
        client = client_service.get_client()
        try:
            resp, content = client.request(url, method='GET', body='', headers=None)
        except Exception as exp:
            print exp
            return None

        result = json.loads(content)
        if 'errors' in result:
            # Rate limited: swap clients and retry once.
            client_service.remove_client(client)
            client = client_service.get_client()
            resp, content = client.request(url, method='GET', body='', headers=None)

        dest_file = os.path.join(parent_dir, TwitterService.append_timestamp_suffix(screen_name + '_' + cursor))
        with open(dest_file, 'w') as wf:
            wf.write(str(content) + '\n')

        return content

    # Tweets (user timeline)
    @staticmethod
    def read_user_timeline_by_screen_name(screen_name, max_id=None, count=200):
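        # max_id pages backwards through the timeline: pass the lowest tweet
        # id seen so far (minus one, to avoid a duplicate) to fetch older tweets.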
        top_parent_dir = os.path.join(os.path.dirname(__file__), 'static')
        parent_dir = os.path.join(top_parent_dir, screen_name, 'timeline')
        if not os.path.exists(parent_dir):
            os.makedirs(parent_dir)

        # Return the newest cached snapshot if one exists.
        dest_file_list = os.listdir(parent_dir)
        if dest_file_list:
            dest_file = sorted(dest_file_list, key=lambda k: k.split('_')[-1], reverse=True)[0]
            with open(os.path.join(parent_dir, dest_file), 'r') as rf:
                return rf.readline()

        url = 'https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name=$screen_name&count=$count&exclude_replies=true'
        if max_id is not None:
            url = url + '&max_id=$max_id'
        url = url.replace('$screen_name', screen_name).replace('$count', str(count)).replace('$max_id', str(max_id))
        client = client_service.get_client()
        try:
            resp, content = client.request(url, method='GET', body='', headers=None)
        except Exception as exp:
            print exp
            return None

        result = json.loads(content)
        if 'errors' in result:
            # Rate limited: swap clients and retry once.
            client_service.remove_client(client)
            client = client_service.get_client()
            resp, content = client.request(url, method='GET', body='', headers=None)

        # len(content) > 2 skips an empty '[]' response.
        if content is not None and len(content) > 2:
            suffix = '-1' if max_id is None else str(max_id)
            dest_file = os.path.join(parent_dir, TwitterService.append_timestamp_suffix(screen_name + '_' + suffix))
            with open(dest_file, 'w') as wf:
                wf.write(str(content) + '\n')

        return content

    # Tweets by numeric user id (no disk caching here)
    @staticmethod
    def read_user_timeline_by_user_id(user_id, max_id=None, count=200):
        url = 'https://api.twitter.com/1.1/statuses/user_timeline.json?user_id=$user_id&count=$count&exclude_replies=true'
        if max_id is not None:
            url = url + '&max_id=$max_id'
        url = url.replace('$user_id', str(user_id)).replace('$count', str(count)).replace('$max_id', str(max_id))
        client = client_service.get_client()
        try:
            resp, content = client.request(url, method='GET', body='', headers=None)
        except Exception as exp:
            print exp
            return None
        return content

    # User profile
    @staticmethod
    def read_user_info_by_screen_name(screen_name):
        top_parent_dir = os.path.join(os.path.dirname(__file__), 'static')
        parent_dir = os.path.join(top_parent_dir, screen_name, 'profile')
        if not os.path.exists(parent_dir):
            os.makedirs(parent_dir)

        # Also fetch (and cache) this user's followers alongside the profile.
        TwitterService.read_followers_by_screen_name(screen_name)

        # Return the newest cached snapshot if one exists.
        dest_file_list = os.listdir(parent_dir)
        if dest_file_list:
            dest_file = sorted(dest_file_list, key=lambda k: k.split('_')[-1], reverse=True)[0]
            with open(os.path.join(parent_dir, dest_file), 'r') as rf:
                return rf.readline()

        url = 'https://api.twitter.com/1.1/users/show.json?screen_name=$screen_name'
        url = url.replace('$screen_name', screen_name)
        client = client_service.get_client()
        try:
            resp, content = client.request(url, method='GET', body='', headers=None)
        except Exception as exp:
            print exp
            return None

        result = json.loads(content)
        if 'errors' in result:
            # Rate limited: swap clients and retry once.
            client_service.remove_client(client)
            client = client_service.get_client()
            resp, content = client.request(url, method='GET', body='', headers=None)

        dest_file = os.path.join(parent_dir, TwitterService.append_timestamp_suffix(screen_name))
        with open(dest_file, 'w') as wf:
            wf.write(str(content) + '\n\n')

        return content

    @staticmethod
    def append_timestamp_suffix(prefix):
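        # e.g. append_timestamp_suffix('jack_-1') -> 'jack_-1_1521012345'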
        return prefix + '_' + str(int(time.time()))


# "Rltime" = real-time: unlike TwitterService, these methods page through the
# live API on every call instead of returning a cached file.
class TwitterRlTimeService(object):

    @staticmethod
    def read_user_rltime_followers(screen_name):
        parent_dir = os.path.join(os.path.dirname(__file__), 'download')
        if not os.path.exists(parent_dir):
            os.makedirs(parent_dir)

        filename = os.path.join(parent_dir, screen_name + '_followers.xlsx')
        workbook = xlsxwriter.Workbook(filename=filename)
        worksheet = workbook.add_worksheet(name='Sheet1')

        url = 'https://api.twitter.com/1.1/followers/list.json?screen_name=$screen_name&cursor=$cursor&count=$count&skip_status=true&include_user_entities=false'
        cursor, count, row_count = -1, 200, 0
        while True:
            t_url = url.replace('$screen_name', screen_name).replace('$cursor', str(cursor)).replace('$count', str(count))
            client = client_service.get_client()
            try:
                resp, content = client.request(t_url, method='GET', body='', headers=None)
            except Exception as exp:
                print exp
                break

            result = json.loads(content)
            if 'errors' in result:
                client_service.remove_client(client)
                client = client_service.get_client()
                resp, content = client.request(t_url, method='GET', body='', headers=None)
                result = json.loads(content)
            if 'next_cursor' not in result:
                break
            if result['next_cursor'] == cursor:
                break

            friends = result['users']
            for friend in friends:
                create_at = str(friend['created_at'])
                create_date = datetime.datetime.strptime(create_at, '%a %b %d %H:%M:%S +0000 %Y')
                create_date_txt = create_date.strftime('%Y-%m-%d %H:%M:%S')
                worksheet.write(row_count, 0, friend['name'])
                worksheet.write(row_count, 1, friend['screen_name'])
                worksheet.write(row_count, 2, str(friend['description']).strip().replace('\n', ''))
                worksheet.write(row_count, 3, str(friend['location']))
                worksheet.write(row_count, 4, friend['statuses_count'])
                worksheet.write(row_count, 5, friend['friends_count'])
                worksheet.write(row_count, 6, friend['followers_count'])
                worksheet.write(row_count, 7, friend['favourites_count'])
                worksheet.write(row_count, 8, create_date_txt)

                row_count += 1

            if result['next_cursor'] == 0:
                break
            cursor = result['next_cursor']
            print cursor

        workbook.close()

    @staticmethod
    def read_user_rltime_followings(screen_name):
        parent_dir = os.path.join(os.path.dirname(__file__), 'download')
        if not os.path.exists(parent_dir):
            os.makedirs(parent_dir)

        filename = os.path.join(parent_dir, screen_name + '_followings.xlsx')
        workbook = xlsxwriter.Workbook(filename=filename)
        worksheet = workbook.add_worksheet(name='Sheet1')

        url = 'https://api.twitter.com/1.1/friends/list.json?screen_name=$screen_name&cursor=$cursor&count=$count&skip_status=true&include_user_entities=false'
        cursor, count, row_count = -1, 200, 0
        while True:
            t_url = url.replace('$screen_name', screen_name).replace('$cursor', str(cursor)).replace('$count', str(count))
            client = client_service.get_client()
            try:
                resp, content = client.request(t_url, method='GET', body='', headers=None)
            except Exception as exp:
                print exp
                break

            result = json.loads(content)
            if 'errors' in result:
                client_service.remove_client(client)
                client = client_service.get_client()
                try:
                    resp, content = client.request(t_url, method='GET', body='', headers=None)
                except Exception as exp:
                    print exp
                    break
                result = json.loads(content)
            if 'next_cursor' not in result:
                break
            if result['next_cursor'] == cursor:
                break

            friends = result['users']
            for friend in friends:
                create_at = str(friend['created_at'])
                create_date = datetime.datetime.strptime(create_at, '%a %b %d %H:%M:%S +0000 %Y')
                create_date_txt = create_date.strftime('%Y-%m-%d %H:%M:%S')

                worksheet.write(row_count, 0, friend['name'])
                worksheet.write(row_count, 1, friend['screen_name'])
                worksheet.write(row_count, 2, str(friend['description']).strip().replace('\n', ''))
                worksheet.write(row_count, 3, str(friend['location']))
                worksheet.write(row_count, 4, friend['statuses_count'])
                worksheet.write(row_count, 5, friend['friends_count'])
                worksheet.write(row_count, 6, friend['followers_count'])
                worksheet.write(row_count, 7, friend['favourites_count'])
                worksheet.write(row_count, 8, create_date_txt)

                row_count += 1

            if result['next_cursor'] == 0:
                break
            cursor = result['next_cursor']
            print cursor

        workbook.close()
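
A minimal driver sketch for the services above ('some_user' is a placeholder screen name, and real OAuth credentials must be filled into utils/twitter_client.py below before anything will run):

# Hypothetical usage, appended to the same module:
if __name__ == '__main__':
    # One cached page of followers (cursor '-1' is the first page):
    print TwitterService.read_followers_by_screen_name('some_user')
    # Full follower export to download/some_user_followers.xlsx:
    TwitterRlTimeService.read_user_rltime_followers('some_user')
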
utils/twitter_client.py (the ClientService imported above):

# -*- coding:utf-8 -*-

import sys
import oauth2
import random
import threading

default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)


KEY_1 = ''
SECRET_1 = ''
CONSUMER_KEY_1 = ''
CONSUMER_SECRET_1 = ''

KEY_2 = ''
SECRET_2 = ''
CONSUMER_KEY_2 = ''
CONSUMER_SECRET_2 = ''


class ClientService(object):

    client_1 = None

    client_2 = None

    client_pool = []

    def __init__(self):
        if self.client_1 is None:
            consumer_1 = oauth2.Consumer(key=CONSUMER_KEY_1, secret=CONSUMER_SECRET_1)
            token_1 = oauth2.Token(key=KEY_1, secret=SECRET_1)
            self.client_1 = oauth2.Client(consumer=consumer_1, token=token_1)

        if self.client_2 is None:
            consumer_2 = oauth2.Consumer(key=CONSUMER_KEY_2, secret=CONSUMER_SECRET_2)
            token_2 = oauth2.Token(key=KEY_2, secret=SECRET_2)
            self.client_2 = oauth2.Client(consumer=consumer_2, token=token_2)

        self.client_pool = [self.client_1, self.client_2]

    def add_client(self, current_client):
        self.client_pool.append(current_client)

    def remove_client(self, current_client):
        self.client_pool.remove(current_client)
        timer = threading.Timer(900, self.add_client, (current_client,))
        timer.start()

    def get_client(self):
        if not self.client_pool:
            raise Exception('no clients temporarily available: rate limit exceeded')
        return random.choice(self.client_pool)
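
One caveat: client_pool is mutated both from the calling thread and from threading.Timer threads, so concurrent use can race. A hedged sketch of a lock-protected variant (the class name and cooldown parameter are illustrative; 900 s matches Twitter's 15-minute rate-limit window):

import random
import threading

class SafeClientPool(object):
    # Illustrative only: ClientService with a lock around pool mutation.
    def __init__(self, clients):
        self._clients = list(clients)
        self._lock = threading.Lock()

    def get_client(self):
        with self._lock:
            if not self._clients:
                raise Exception('rate limit exceeded: no clients available')
            return random.choice(self._clients)

    def remove_client(self, client, cooldown=900):
        with self._lock:
            if client in self._clients:
                self._clients.remove(client)
        # Put the client back once the rate-limit window has passed.
        threading.Timer(cooldown, self._readd, (client,)).start()

    def _readd(self, client):
        with self._lock:
            self._clients.append(client)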

DouBan
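
A Douban movie crawler in a single file: the hot-movies JSON endpoint, a subject-page scrape, short-comment paging into per-movie CSV files, and a jieba + WordCloud analysis of one CSV.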

# -*- coding:utf-8 -*-

import os
import re
import sys
import json
import jieba
import requests
import pandas as pd
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator

default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# HEADER
header = {
    'Host': 'movie.douban.com',
    'Referer': 'https://movie.douban.com/subject/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
}

# PROXY
proxies = [
    {'http': '140.143.96.216:80', 'https': '140.143.96.216:80'},
    {'http': '119.27.177.169:80', 'https': '119.27.177.169:80'},
    {'http': '221.7.255.168:8080', 'https': '221.7.255.168:8080'}
]
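
# Note: the proxies list above is declared but never passed to the
# requests.get() calls below. A hedged sketch of wiring one in (needs
# `import random` at the top; picks a proxy per request):
#     resp = requests.get(url, headers=header, proxies=random.choice(proxies))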


def movie_recommend_demo_spider():
    # tag=%E7%83%AD%E9%97%A8 is the URL-encoded tag 热门 ("hot").
    url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0'
    resp = requests.get(url)
    if resp.ok:
        result = json.loads(resp.content)
        for subject in result['subjects']:
            print subject['id']
            print subject['title']
            print subject['url']
            print subject['rate']
            print subject['is_new']


def movie_subject_demo_spider():
    url = 'https://movie.douban.com/subject/26374197/'
    resp = requests.get(url)
    if resp.ok:
        html = BeautifulSoup(resp.content, 'html.parser')
        print html.select('div.rating_sum span')[0].text
        for t in html.select('div.indent span'):
            if t.has_attr('property'):
                prop = t.get('property')
                if prop == 'v:summary':
                    print str(t.text).strip()

        for s_tag in html.select('script'):
            if s_tag.has_attr('type'):
                type_txt = s_tag.get('type')
                if type_txt == 'application/ld+json':
                    info = json.loads(s_tag.text)
                    print info['name']
                    print info['director']
                    print info['author']


def movie_subject_comment_demo_spider():
    # new_score sorts by helpfulness, time sorts by recency; only the
    # time-sorted URL is requested here, with $start filled in as 0.
    s_url = 'https://movie.douban.com/subject/26374197/comments?start=$start&limit=20&sort=new_score&status=P'
    t_url = 'https://movie.douban.com/subject/26374197/comments?start=$start&limit=20&sort=time&status=P'
    resp = requests.get(t_url.replace('$start', '0'))
    if resp.ok:
        html = BeautifulSoup(resp.content, 'html.parser')
        comment_div_tags = html.select('div.comment')
        for comment_div_tag in comment_div_tags:
            comment_id = comment_div_tag.select('h3 span.comment-vote input')[0].get('value')
            comment_votes = comment_div_tag.select('h3 span.comment-vote span')[0].text
            comment_user_tag = comment_div_tag.select('h3 span.comment-info a')[0]
            comment_user_name = comment_user_tag.text
            comment_user_profile = comment_user_tag.get('href')
            # Star labels map to 1-5 (the live site serves simplified Chinese):
            # 力荐 5, 推荐 4, 还行 3, 较差 2, 很差 1.
            comment_user_rating_txt = comment_div_tag.select('h3 span.comment-info span')[1].get('title')
            rating_map = {u'力荐': 5, u'推荐': 4, u'还行': 3, u'较差': 2, u'很差': 1}
            comment_user_rating = rating_map.get(comment_user_rating_txt, 1)
            comment_time = comment_div_tag.select('h3 span.comment-info span.comment-time')[0].get('title')
            comment_text = comment_div_tag.select('p span.short')[0].text
            print comment_id
            print comment_votes
            print comment_user_name
            print comment_user_profile
            print comment_user_rating
            print comment_time
            print comment_text


def movie_comment_spider():
    url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0'
    resp = requests.get(url)
    if resp.ok:
        result = json.loads(resp.content)
        for subject in result['subjects']:
            subject_id = subject['id']
            subject_title = subject['title']
            print '%s %s' % (subject_id, subject_title)
            subject_url = subject['url']
            subject_resp = requests.get(subject_url)
            if subject_resp.ok:
                html = BeautifulSoup(subject_resp.content, 'html.parser')
                subject_comment_count = html.select('div.rating_sum span')[0].text
                print subject_comment_count
                subject_short_comment_count_txt = html.select('#comments-section div.mod-hd h2 a')[0].text
                subject_short_comment_count = re.findall(r'\d+', subject_short_comment_count_txt)[0]
                print subject_short_comment_count
                movie_subject_comment_spider(subject_id, subject_title, int(subject_short_comment_count))
            else:
                print subject_resp.content
    else:
        print resp.content

def movie_subject_comment_spider(subject_id, subject_title, subject_short_comment_count):
    t_url = 'https://movie.douban.com/subject/$subject_id/comments?start=$start&limit=20&sort=time&status=P'.replace('$subject_id', subject_id)
    column1 = []
    column2 = []
    column3 = []
    column4 = []
    column5 = []
    column6 = []
    column7 = []
        # Capped at 10 pages (200 comments); the commented-out line would page
        # through every short comment instead.
        # for i in range((subject_short_comment_count / 20) + 1):
        for i in range(10):
        resp = requests.get(t_url.replace('$start', str(20 * i)), headers=header)
        if resp.ok:
            html = BeautifulSoup(resp.content, 'html.parser')
            comment_div_tags = html.select('div.comment')
            for comment_div_tag in comment_div_tags:
                comment_id = comment_div_tag.select('h3 span.comment-vote input')[0].get('value')
                comment_votes = comment_div_tag.select('h3 span.comment-vote span')[0].text
                comment_user_tag = comment_div_tag.select('h3 span.comment-info a')[0]
                comment_user_name = comment_user_tag.text
                comment_user_profile = comment_user_tag.get('href')
                # Star labels (simplified Chinese on the live site):
                # 力荐 5, 推荐 4, 还行 3, 较差 2, 很差 1.
                comment_user_rating_txt = comment_div_tag.select('h3 span.comment-info span')[1].get('title')
                rating_map = {u'力荐': 5, u'推荐': 4, u'还行': 3, u'较差': 2, u'很差': 1}
                comment_user_rating = rating_map.get(comment_user_rating_txt, 1)
                comment_time = comment_div_tag.select('h3 span.comment-info span.comment-time')[0].get('title')
                comment_text = comment_div_tag.select('p span.short')[0].text
                column1.append(comment_id)
                column2.append(comment_user_name)
                column3.append(comment_user_profile)
                column4.append(comment_user_rating)
                column5.append(comment_votes)
                column6.append(comment_time)
                column7.append(str(comment_text).strip().replace(' ', '').replace('\n', '').replace('\r', ''))

    df = pd.DataFrame({'id': column1, 'name': column2, 'profile': column3, 'rating': column4,\
        'votes': column5, 'time': column6, 'text': column7})
    df.to_csv('F:\\result\\tmp\\douban\\$subject_title.csv'.replace('$subject_title', subject_title), sep=',', na_rep='NA', index=False)
    #df.to_csv('/home/ym/Project/datamining/resources/$subject_title.csv'.replace('$subject_title', subject_title), sep=',', na_rep='NA', index=False)


def movie_comment_analyze():
    # to_csv above wrote a header row, so let pandas infer the column names
    # (passing a set to names= has no defined order and misreads the header).
    df = pd.read_csv('F:\\result\\tmp\\douban\\fengkuangwaixingren.csv')
    words = []
    for content in df['text']:
        words.extend(jieba.cut(content))
    word_txt = ' '.join(words)
    wc = WordCloud(background_color='white',
                   max_words=1000,  # maximum number of words shown
                   max_font_size=100,  # largest font size used
                   #mask=back_color,  # mask image; when set, width/height are ignored
                   width=1000,
                   height=800,
                   random_state=42,  # seed, so the layout is reproducible
                   font_path='F:/develop/python/stfangso/STFANGSO.TTF'
                   )
    print word_txt
    wc.generate(word_txt)
    # To color words from an image: read a mask and build an ImageColorGenerator.
    #back_color = imread('o_002.jpg')
    #image_colors = ImageColorGenerator(back_color)
    # Display the cloud with the axes hidden.
    plt.imshow(wc)
    plt.axis('off')
    plt.show()

    #wc.to_file('F:/a.png')
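
A minimal, hypothetical driver for the functions above (the demo functions print to stdout; movie_comment_spider writes one CSV per hot movie under the hard-coded F:\result\tmp\douban path):

if __name__ == '__main__':
    movie_recommend_demo_spider()   # hot-movie listing via the JSON API
    movie_subject_demo_spider()     # one subject page via HTML scraping
    movie_comment_spider()          # short comments -> one CSV per movie
    movie_comment_analyze()         # word cloud from a saved CSV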
