1.数据获取方式

xpath

/#未完成

import requests
from lxml import html as lxml_html  #pip install lxml 用于解析html
# Fetch the page. The timeout keeps the script from hanging on a dead connection.
response = requests.get(url='http://www.baidu.com', timeout=10)
# Set the encoding explicitly so that response.text is decoded as UTF-8.
response.encoding = 'utf-8'
html = response.text
# Parse the markup into an element tree that can be queried with XPath.
doc = lxml_html.fromstring(html)
# xpath() returns a list of matches; replace the placeholder with a real XPath expression.
title = doc.xpath('这里是xpath规则')

>>>title
['匹配的内容']

正则

最常用的是 (.*?)，即非贪婪匹配；注意 . 默认不匹配换行符，如果要匹配的内容里有换行，可以用 ([\s\S]*?)；而 ([\s\S]*) 是贪婪匹配，会尽可能多地匹配
例：

import re
# Sample JSON text used as the search target.
string = '{"vm_type":"kvm","ve_status":"running","ve_mac1":"***********","ve_used_disk_space_b":5088407552,"ve_disk_quota_gb":"10","is_cpu_throttled":"","ssh_port":29364,"live_hostname":"ubuntu","load_average":"0.00 0.00 0.00 1\\/167 27303","mem_available_kb":345856,"swap_total_kb":135164,"swap_available_kb":100160,"hostname":"localhost.localdomain","node_ip":"**********","node_alias":"v7415","node_location":"US, California","node_location_id":"USCA_3","node_datacenter":"US: Los Angeles, California (DC3 CN2)","location_ipv6_ready":false,"plan":"kvmv3-10g-512m-500m-ca-cn2","plan_monthly_data":536870912000,"monthly_data_multiplier":1,"plan_disk":10737418240,"plan_ram":536870912,"plan_swap":0,"plan_max_ipv6s":0,"os":"ubuntu-16.04-x86_64","email":"[email protected]","data_counter":14346176667,"data_next_reset":1532495173,"ip_addresses":["**********"],"rdns_api_available":true,"ptr":{"********":null},"suspended":false,"error":0,"veid":938308}'
# Non-greedy capture: the group stops at the first double quote after "plan":".
re_plan = r'"plan":"(.*?)"'
# findall() returns a list with the captured group of every match.
plan = re.findall(pattern=re_plan, string=string)

>>>plan
['kvmv3-10g-512m-500m-ca-cn2']


# Raw string so that \s and \S reach the regex engine untouched; in a normal
# string literal "\s" is an invalid escape sequence and triggers a warning.
# [\s\S]* is greedy, so the capture runs up to the LAST double quote in the text.
re_plan = r'"plan":"([\s\S]*)"'
plan = re.findall(re_plan, string)

>>>plan
['kvmv3-10g-512m-500m-ca-cn2","plan_monthly_data":536870912000,"monthly_data_multiplier":1,"plan_disk":10737418240,"plan_ram":536870912,"plan_swap":0,"plan_max_ipv6s":0,"os":"ubuntu-16.04-x86_64","email":"[email protected]","data_counter":14346176667,"data_next_reset":1532495173,"ip_addresses":["**********"],"rdns_api_available":true,"ptr":{"********":null},"suspended":false,"error":0,"veid']

css

暂时还没有接触过。。。

2.string 转 json

import json
from pprint import pprint
# JSON document held as plain text.
string = '{"vm_type":"kvm","ve_status":"running","ve_mac1":"***********","ve_used_disk_space_b":5088407552,"ve_disk_quota_gb":"10","is_cpu_throttled":"","ssh_port":29364,"live_hostname":"ubuntu","load_average":"0.00 0.00 0.00 1\\/167 27303","mem_available_kb":345856,"swap_total_kb":135164,"swap_available_kb":100160,"hostname":"localhost.localdomain","node_ip":"**********","node_alias":"v7415","node_location":"US, California","node_location_id":"USCA_3","node_datacenter":"US: Los Angeles, California (DC3 CN2)","location_ipv6_ready":false,"plan":"kvmv3-10g-512m-500m-ca-cn2","plan_monthly_data":536870912000,"monthly_data_multiplier":1,"plan_disk":10737418240,"plan_ram":536870912,"plan_swap":0,"plan_max_ipv6s":0,"os":"ubuntu-16.04-x86_64","email":"[email protected]","data_counter":14346176667,"data_next_reset":1532495173,"ip_addresses":["**********"],"rdns_api_available":true,"ptr":{"********":null},"suspended":false,"error":0,"veid":938308}'

# Decode the JSON text into native Python objects.
j_son = json.loads(string)

# pprint lays the nested structure out over several lines instead of one long line.
pprint(j_son)

3.关于一些常见编码问题通用方案

import requests
from pprint import pprint
url = "http://www.baidu.com"
# Always pass a timeout; without one the request can wait indefinitely.
html = requests.get(url, timeout=10)
# Set the encoding before reading .text so that the body is decoded as UTF-8.
html.encoding = 'utf-8'
pprint(html.text)

大致效果

4.logging

为什么要用logging ：

我记得有个皮皮怪这么回答：一个计算式，计算并返回结果耗时 1%，print 耗时 99%

当然，写python随心就行，但是大项目还是得用logging来做下记录的，小项目 print + file write 也没事

import logging
import requests

class Project(object):
    """Base object that configures console logging and exposes it as ``self.logger``."""

    # Name used to recognise the handler installed by this class.
    _HANDLER_NAME = "project-stream-handler"

    def __init__(self):
        # To hide noisy records from other libraries (for example the URL
        # request records logged for requests), raise that logger's level:
        # logging.getLogger("requests").setLevel(logging.WARNING)
        self.logger = logging.getLogger()
        # getLogger() with no name returns the shared root logger, so attach
        # the handler only once; otherwise every additional Project() would
        # add another handler and each record would be printed repeatedly.
        already_installed = any(
            existing.get_name() == self._HANDLER_NAME
            for existing in self.logger.handlers
        )
        if not already_installed:
            handler = logging.StreamHandler()
            handler.set_name(self._HANDLER_NAME)
            formatter = logging.Formatter('%(asctime)s [%(threadName)s][%(levelname)s] %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
        self.logger.setLevel(logging.DEBUG)  # the logging level is set here

具体到某一步的logging记录！！！注意不管是debug还是 info还是warning，括号内必须是字符串

self.logger.debug("这里来记录一些内容") # not limited to debug(): info() and warning() are called the same way, with the message passed as a string

5.关于一些比较常用的字符串处理

字符串去除空格换行符

# With no arguments, strip() returns a copy with leading and trailing
# whitespace (spaces, tabs, newlines) removed; inner whitespace is kept.
string.strip()

字符串分割

# For example, split on ':' as the delimiter; split() returns a list of the pieces.
string.split(':')

字符串替换

string = "w:w:m"
# replace() leaves the original untouched and returns a new string
# in which every ':' has been swapped for '/'.
a = string.replace(':', '/')

>>>a
'w/w/m'

6.python切片

为什么要讲切片
当我们的爬虫获取到信息时候，不可能是十全十美的，总是会有残缺的情况的
比如：
我们应该拿到的完整的信息是这个样子的

# Complete record: every field is present.
info = {
'title':'titel_content',
'content':['1','2','3'],
'tags':['life','love'],
'author':['name:horsun','introduce:~~~~~~~~']# e.g. author is a list of length 2
}

但是实际上我们拿到了这个样子的：

# Incomplete record: the author list is missing its second entry.
info = {
'title':'titel_content',
'content':['1','2','3'],
'tags':['life','love'],
'author':['name:horsun']# some content is missing, so the list has only one element
}

所以，当我们要获取 introduce 的时候，应该取 info['author'][1]；但是当数据出现第二种情况时，info['author'][1] 就会抛错（IndexError: list index out of range）。这时可以改用切片：切片越界不会报错，只会返回一个空列表。例如 info['author'][0:1] 获取的是 info['author'][0]，但是这样返回的是一个 list ------>['name:horsun']。info['author'][0:1] 中第一个 0 是起始位置，第二个 1 是终止位置（不包含），类似数学的左闭右开区间 [4,8)---->4,5,6,7。所以我们可以通过判断 info['author'][1:2] 的长度来确定第二个元素是否存在：缺失时 ---->info['author'][1:2].__len__() ==0


# Incomplete record: the author list holds only the name entry.
info = {
'title':'titel_content',
'content':['1','2','3'],
'tags':['life','love'],
'author':['name:horsun']# some content is missing, so the list has only one element
}
# NOTE(review): Article is not defined in this snippet; presumably a model
# class from the surrounding project. Confirm before running.
article = Article()
# A slice never raises IndexError; it is simply empty when the index is out of
# range. Its length therefore tells us whether the element exists, and ' ' is
# used as the fallback value when it does not.
article.create(
title = info['title'],
content= info['content'],
tags= info['tags'],
author_name= info['author'][0]   if info['author'][0:1].__len__() !=0 else ' ',
author_intro= info['author'][1]   if info['author'][1:2].__len__() !=0 else ' ',
)

关于 if info['author'][1:2].__len__() !=0 else ' ' 这句：如果 info['author'][1:2].__len__() !=0 成立，就取 author_intro= info['author'][1]，否则就取 author_intro=' '

7.一些常见的反反爬手段

设置headers 即请求头
-基本上的url设置一个UA(User-Agent)就行了，除非一些特例要完全按照抓包的请求 headers来
例：headers2 = { 'Host': 'jwxt.zwu.edu.cn', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0(Windows NT 10.0;Win64;x64)AppleWebKit /537.36(KHTML, like Gecko)Chrome / 67.0.3396.62 Safari / 537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Referer': 'http://jwxt.zwu.edu.cn/xs_main.aspx?xh=2014014701', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh,en;q=0.9,zh-CN;q=0.8', }
time sleep #注意：不要固定一个sleep时间，可以用 random 随机取值，否则也会被判定为机器人（哪有人能控制时间这么准确的）
例：

import time
import random

# Pause for a random whole number of seconds from 5 to 10 (both included).
delay = random.randint(5, 10)
time.sleep(delay)

设置代理（效率最高）# requests为例

import requests
url = 'http://www.baidu.com'
# Map each URL scheme to the proxy that should carry it. The proxy URL must
# include its own scheme: "<scheme>://<ip>:<port>".
proxies = {
    "http": "http://110.110.110.110:110",
}
# The timeout keeps the script from hanging when the proxy does not answer.
html = requests.get(url, proxies=proxies, timeout=10)

通过 selenium 来爬（万能,但是效率较低）
有代理谁用 selenium
以及一些携带奇葩的post参数
记录登陆 session
下面讲一个 asp.NET的奇葩post参数例子
大学校园教务网个人成绩页查询页爬虫。页面是asp.net编写的。
·——————————————————·
先通过登陆页面 url1 登陆来记录session，然后请求成绩查询页面（还没有数据的页面） url2 ，再请求查询全部成绩 url3。其实讲道理，一般的网站，你记录了登陆session，所有页面都是随心所欲的，所以url2也能正常访问。但是当我打算获取我的所有成绩的时候，通过请求url3发现，请求失败，并没有出现我想要的成绩数据。通过抓包发现，其实url2和url3 是同一个url，但是url2 是get方法，url3 是post方法，并且 url3 在post的时候发送了一个 '__VIEWSTATE' 和一个 '__VIEWSTATEGENERATOR'参数。通过网上查找发现，这俩个参数其实是在url2的get请求返回时插在了html中，在url3发送post请求的时候携带了这俩个参数，服务器接受请求的时候验证了这俩个参数才给响应

`注意，这俩个url请求的headers也不完全相同，一定要按照抓包的时候请求头 headers来请求`

代码片段

    def get_response_data(self):
        """
        Collect the form fields needed for the second request to the same URL.

        The URL is requested twice: first with GET, then with POST. The POST
        carries a ``data`` payload whose ``__VIEWSTATE`` and
        ``__VIEWSTATEGENERATOR`` values are found in the HTML returned by the
        GET. This method performs the GET, extracts both values with regular
        expressions and stores the complete payload in ``self.data``.

        :return: None
        """
        page_url = self.url4.format(self.student_id)
        response = self.session.get(url=page_url, headers=self.headers2)
        response.encoding = 'gb2312'
        html = response.text

        # findall() returns a list of captured values; join() collapses it
        # into a single string, which is empty when nothing matched.
        viewstate_hits = re.findall('name="__VIEWSTATE" value="(.*?)"', html)
        generator_hits = re.findall('name="__VIEWSTATEGENERATOR" value="(.*?)"', html)

        payload = {
            '__VIEWSTATE': ''.join(viewstate_hits),
            '__VIEWSTATEGENERATOR': ''.join(generator_hits),
            'ddlXN': '',
            'ddlXQ': '',
            # NOTE(review): this value is already percent-encoded; confirm the
            # server accepts it once the form body has been encoded again.
            'Button1': '%B0%B4%D1%A7%C6%DA%B2%E9%D1%AF',
        }
        self.data = payload

    def get_score(self):
        """
        POST ``self.data`` to the target URL and print the returned HTML.

        Once the HTML has been fetched, the page can be parsed and the data
        extracted with regular expressions or XPath.

        :return: None
        """
        request_kwargs = dict(
            url=self.url4.format(self.student_id),
            data=self.data,
            headers=self.score_headers,
            cookies=self.session.cookies,
        )
        response = self.session.post(**request_kwargs)
        response.encoding = 'gb2312'
        html = response.text
        # At this point the whole page, including its data, has been fetched.
        print(html)
        # Parsed tree for XPath queries; it is not used further in this fragment.
        doc = lxml_html.fromstring(html)

完整代码：https://github.com/helloworld19951213/get_my_school_scroe/blob/master/spider.py

8.增量爬取/去重（重复爬取）/更新爬取

去重和增量爬取的主要思想就是把已经爬过的数据放到列表里面，通过判断url或者别的参数是否在列表中来决定是否爬取

详细见我另一篇文章 https://blog.csdn.net/qq_33042187/article/details/78929834
——————————————————分割线————————————————————————————

待补充 2018/7/4

TODO
-> ~~应对常见的反爬有效手段~~ 7.5
->~~数据去重~~
->~~增量爬取~~
->多线程爬虫以及多线程+队列实现线程间的通信
->一些抓包手段
待补充

python爬虫汇总

1.数据获取方式

xpath

正则

css

2.string 转 json

3.关于一些常见编码问题通用方案

4.logging

为什么要用logging ：

当然，写python随心就行，但是大项目还是得用logging来做下记录的，小项目 print + file write 也没事

5.关于一些比较常用的字符串处理

字符串去除空格换行符

字符串分割

字符串替换

6.python切片

7.一些常见的反反爬手段

`注意，这俩个url请求的headers也不完全相同，一定要按照抓包的时候请求头 headers来请求`

8.增量爬取/去重（重复爬取）/更新爬取

待补充 2018/7/4

django人性化設置時間改爲多久前

JWT失效方式---------django rest framework jwt

初識約瑟夫環--python

python爬蟲彙總

list of dict 轉換成 dict of list 字典形列表轉換列表形字典 in Python

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結

python爬虫汇总

1.数据获取方式

xpath

正则

css

2.string 转 json

3.关于一些常见编码问题通用方案

4.logging

为什么要用logging ：

当然，写python随心就行，但是大项目还是得用logging来做下记录的，小项目 print + file write 也没事

5.关于一些比较常用的字符串处理

字符串去除 空格 换行符

字符串分割

字符串替换

6.python切片

7.一些常见的反 反爬 手段

注意，这俩个url请求的headers也不完全相同，一定要按照抓包的时候请求头 headers来请求

8.增量爬取/去重（重复爬取）/更新爬取

待补充 2018/7/4

字符串去除空格换行符

7.一些常见的反反爬手段

`注意，这俩个url请求的headers也不完全相同，一定要按照抓包的时候请求头 headers来请求`