[Python爬蟲] 3-數據解析(lxml/bs4/正則)

# I.Xpath語法和lxml模塊
# 1)Xpath語法
# 1.概念:XPath是一門在XML/HTML文檔中查找信息的語言
# 2.工具:Chrome的XPath Helper和Firefox的XPath Checker
# 3.語法:使用//獲取整個頁面當中的元素,然後寫標籤名,然後寫謂詞進行提取,例://div[@class="abc"]
# 4.幾個注意點:
    # i.//子孫節點,/子節點,@屬性
    # ii.contains:某個屬性中出現了多個值,可以使用contains函數,例://div[contains(@class,"job_detail")]
    # iii.謂詞的第一個下標爲1
    
# 2)lxml模塊
# 1.概念:lxml是一個基於C語言的XML/HTML的解析器,用於解析和提取XML/HTML數據
# 2.編碼
# 2-1.解析html字符串,使用lxml.etree.HTML進行解析(str-> lxml.etree._Element-> bytes)
from lxml import etree

# Sample HTML fragment used to demonstrate parsing from a string.
text = """
<!-- hello.html -->
<div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
"""
# etree.HTML parses a str into an element tree (str -> lxml.etree._Element).
root = etree.HTML(text)
# tostring serializes the tree back out as bytes (-> bytes).
serialized = etree.tostring(root, encoding="utf-8")
print(serialized.decode("utf-8"))
# 2-2. Parsing an HTML file: use lxml.etree.parse
htmlElement = etree.parse("hello.html")
result = etree.tostring(htmlElement,encoding="utf-8")
print(result.decode("utf-8"))
# !!! Important !!! etree.parse defaults to the XML parser, so malformed
# HTML makes it raise a parse error; in that case pass an explicit
# HTML parser, e.g.:
# parser = etree.HTMLParser(encoding="utf-8")
# htmlElement = etree.parse("tencent.html",parser=parser)
# result = etree.tostring(htmlElement,encoding="utf-8")
# print(result.decode("utf-8"))

# 3) lxml combined with xpath
# tencent.html: see appendix 1
from lxml import etree
parser = etree.HTMLParser(encoding="utf-8")
html = etree.parse("tencent.html", parser=parser)
# print(html)
# 1. Get every tr tag.
# NOTE: xpath() always returns a list; take a single element by indexing.
trs = html.xpath("//tr")
# for tr in trs:
#     print(etree.tostring(tr,encoding="utf-8").decode("utf-8"))
#     break
# 2. Get the 2nd tr tag (xpath predicate indexes are 1-based).
tr = html.xpath("//tr[2]")[0]
# print(etree.tostring(tr,encoding="utf-8").decode("utf-8"))
# 3. Get every tr whose class equals "even".
trs = html.xpath("//tr[@class='even']")
# for tr in trs:
#     print(etree.tostring(tr,encoding="utf-8").decode("utf-8"))
# 4. Get the href attribute of every a tag under a td.
# (Each href holds a relative URL; it is completed below.)
hrefs = html.xpath("//td/a/@href")  # fixed: was named `list`, shadowing the builtin
# Note: //a[@href] would instead select a tags that HAVE an href attribute,
# not the href values themselves.
positions = []
for href in hrefs:
    url = "https://hr.tencent.com/" + href
    # print(url)
# 5. Get every job posting as plain text.
# All rows live under tr tags; skip the header row and the trailing rows.
trs = html.xpath("//tr[position()>1 and position()<11]")
for tr in trs:
    # Running xpath on an element again needs the leading "." so the query
    # stays inside the current element instead of the whole document.
    href = tr.xpath(".//a/@href")[0]
    url = "https://hr.tencent.com/" + href
    # text() collects text nodes; // is needed because the text sits in the
    # a child of td, not directly under td.
    title = tr.xpath("./td[1]//text()")[0]
    category = tr.xpath("./td[2]//text()")[0]
    numbers = tr.xpath("./td[3]//text()")[0]
    area = tr.xpath("./td[4]//text()")[0]
    pubdate = tr.xpath("./td[5]//text()")[0]
    # Fixed: keys were inconsistent ("url:", "title:", "pubdate:" carried a
    # stray trailing colon); use plain key names throughout.
    position = {"url": url, "title": title, "category": category,
                "numbers": numbers, "area": area, "pubdate": pubdate}
    positions.append(position)
print(positions)

# 4) Douban "now playing" movie scraper
# 1. Fetch the target page.
import requests
url = "https://movie.douban.com/cinema/nowplaying/hangzhou/"
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
           "Referer":"https://movie.douban.com/"}
response = requests.get(url=url, headers=headers)
text = response.text
# print(text)
# response.text    -> decoded str (unicode)
# response.content -> raw bytes
# 2. Extract the data with xpath.
from lxml import etree
html = etree.HTML(text)
ul = html.xpath("//ul[@class='lists']")[0]
# print(etree.tostring(ul,encoding="utf-8").decode("utf-8"))
movies = []  # one dict per movie
for li in ul.xpath("./li"):
    # Each li carries its metadata as data-* attributes.
    movie = {attr: li.xpath("@data-" + attr)[0]
             for attr in ("title", "score", "duration",
                          "region", "director", "actors")}
    movie["poster"] = li.xpath(".//img/@src")[0]
    movies.append(movie)
print(movies)

# 5) ygdy8.net (Sunshine Movies) scraper
# 1. Libraries used
import requests
from lxml import etree
# 2. Module-level constants shared by every request below
HEADERS = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
          "Referer":"http://www.ygdy8.net/index.html"}
HEADER_URL = "http://www.ygdy8.net/"  # site root, prepended to relative detail URLs
# 3.獲取各個頁面的url,並將各個url導入get_detail_urls函數中
# 3. Build each list-page URL, collect its detail URLs, parse every detail page.
def spider():
    """Crawl the movie list page(s) and return the parsed movies.

    Fixed: the collected ``movies`` list was built but never returned,
    so the work was silently thrown away.
    """
    base_url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"  # list-page URL template
    movies = []  # parsed movie dicts, one per detail page
    for i in range(1, 2):  # page 1 only; widen the range to crawl more pages
        page_urls = base_url.format(i)
        detail_urls = get_detail_urls(page_urls)  # detail-page URLs found on this list page
        for detail_url in detail_urls:
            movie = parse_detail_urls(detail_url)
            movies.append(movie)
    return movies
# 4.得到page_urls後獲取對應頁所有電影的detail_urls
# 4. Given one list-page URL, return the detail-page URLs it links to.
def get_detail_urls(page_urls):
    """Fetch a list page and yield the full URL of every movie link."""
    resp = requests.get(url=page_urls, headers=HEADERS)
    # .text would guess the charset; the page declares charset=gb2312 (a
    # gbk subset), so decode the raw bytes explicitly.  "ignore" skips the
    # occasional byte gbk cannot decode.
    text = resp.content.decode("gbk", "ignore")
    html = etree.HTML(text=text)
    # Relative hrefs of every movie link inside the tbspan table.
    tail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    # Prepend the site root to complete each URL (lazy, like the original map()).
    detail_urls = (HEADER_URL + tail_url for tail_url in tail_urls)
    return detail_urls
# 5.解析各個detail_urls下的具體內容
# 5. Parse one detail page into a movie dict.
def parse_detail_urls(detail_url):
    """Fetch a detail page and return a dict of the movie's attributes.

    Fixed: the movie dict was printed but never returned (the caller in
    spider() appended None), and a duplicated "◎導  演" elif branch was
    unreachable dead code.
    """
    movie = {}
    response = requests.get(url=detail_url, headers=HEADERS)
    text = response.content.decode("gbk", "ignore")  # page declares gb2312; ignore bad bytes
    html = etree.HTML(text=text)
    # 1. Title: when several nodes share the font color, narrow the match by
    # adding an ancestor condition.
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie["title"] = title
    # 2. Poster: first img inside the Zoom div (a second img, when present,
    # is a screenshot, so always take the first).
    zoom = html.xpath("//div[@id='Zoom']")[0]
    imgs = zoom.xpath(".//img/@src")
    movie["poster"] = imgs[0]
    # 3. Remaining fields: every text line inside Zoom, keyed by its "◎" label.
    # Single-line fields are table-driven; multi-line ones are handled below.
    simple_fields = {
        "◎年  代": "year",
        "◎產  地": "country",
        "◎類  別": "category",
        "◎豆瓣評分": "score",
        "◎片  長": "duration",
        "◎導  演": "director",
    }
    infos = zoom.xpath("//div[@id='Zoom']//text()")
    for index, info in enumerate(infos):
        matched = False
        for label, key in simple_fields.items():
            if info.startswith(label):
                movie[key] = info.replace(label, "").strip()
                matched = True
                break
        if matched:
            continue
        if info.startswith("◎主  演"):
            # Actors continue on the following lines until the next "◎" label.
            actors = [info.replace("◎主  演", "").strip()]
            for i in range(index + 1, len(infos)):
                actor = infos[i].strip()  # note: infos[i], not info
                if actor.startswith("◎"):
                    break
                actors.append(actor)
            movie["actor"] = actors
        elif info.startswith("◎簡  介"):
            # The synopsis runs until a "【...】 " heading.
            profiles = [info.replace("◎簡  介", "").strip()]
            for j in range(index + 1, len(infos)):
                profile = infos[j].strip()
                if profile.startswith("【"):
                    break
                profiles.append(profile)
            movie["profile"] = profiles
    movie["download"] = zoom.xpath(".//td[@bgcolor='#fdfddf']//a/@href")[0]
    print(movie)
    return movie
# Run the crawler only when executed as a script (not on import).
if __name__ == '__main__':
    spider()
# II.BeautifulSoup4庫
# 1)基本介紹
# 1.概念:和lxml一樣,也是一個HTML/XML的解析器,lxml只會局部遍歷,而BeautifulSoup基於HTML DOM,會載入整個文檔,相較於lxml更加容易
# 2.官方文檔:https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html

# 2)基本使用
# 2-1.find_all及find方法
# html爲附錄1
from bs4 import BeautifulSoup
html = """..."""
soup = BeautifulSoup(html,"lxml")  # second argument picks the parser; lxml is the usual choice
# 1. Get every tr tag.
trs = soup.find_all("tr")
# 2. Get the second tr: limit caps the number of results (like SQL LIMIT);
# a list is returned.
tr = soup.find_all("tr",limit=2)[1]
# 3. Every tr with class="even"; class is a Python keyword, hence class_.
trs = soup.find_all("tr",class_="even")
# The same query via attrs (a dict of attribute -> value):
trs = soup.find_all("tr",attrs={"class":"even"})
# 4. divs with class="right pl9" AND id="topshares": multiple conditions.
divs = soup.find_all("div",class_="right pl9",id="topshares")
# attrs form of the same query:
divs = soup.find_all("div",attrs={"class":"right pl9","id":"topshares"})
# 5. href attribute of every a tag.
aList = soup.find_all("a")
for a in aList:
    # a) subscript access (preferred)
    href = a["href"]
    # b) via the attrs mapping
    href = a.attrs["href"]
# 6. Job rows as text. .string returns the tag's text only when the tag has
# a single child node (like <td>1</td>); otherwise it is None.
trs = soup.find_all("tr")[1:10]
works = []
for tr in trs:
    # Method 1: first td's .string.
    work = {}
    tds = tr.find_all("td")
    work_name = tds[0].string
    work["work_name"] = work_name
    works.append(work)
    # Method 2: stripped_strings drops the surrounding whitespace.
    # Fixed: this used to mutate and re-append the method-1 dict, so works
    # ended up holding two references to the same dict per row and the
    # method-1 value was overwritten.
    work2 = {}
    infos = list(tr.stripped_strings)
    work_name = infos[0]
    work2["work_name"] = work_name
    works.append(work2)
    
# 2-2. CSS selectors with BeautifulSoup
# CSS syntax recap: tag -> "a"; class -> ".b"; id -> "#c";
# descendant of d -> "#d p"; direct child of d -> "#d>p".
soup = BeautifulSoup(html,"lxml")
# 1. Every tr tag (CSS: tr).
trs = soup.select("tr")
# 2. The 2nd tr tag.
tr = soup.select("tr")[1]
# 3. Every tr with class="even" (CSS: tr.even).
tr = soup.select("tr.even")
tr = soup.select("tr[class='even']")
# 4. class="right pl9" AND id="topshares" on one div:
# not expressible with this selector engine.
# 5. href attribute of a tags.
aList = soup.select("a")[57:67]
for a in aList:
    href = a["href"]
# 6. Every job row as text.
works = []
trs = soup.select("tr")[1:11]
for tr in trs:
    # Fixed: the dict used to be created once outside the loop, so `works`
    # held ten references to one shared dict (all showing the last row);
    # a duplicated `work_name = infos[0]` line is also removed.
    work = {}
    infos = list(tr.stripped_strings)
    work_name = infos[0]
    work["work_name"] = work_name
    works.append(work)

# 3)常用對象
# 1.Tag:BeautifulSoup中所有的標籤都是Tag類型,並且BeautifulSoup的對象本質上也是一個Tag類型,所以其實一些方法比如find,find_all並不是BeautifulSoup的,而是Tag的
# 2.BeautifulSoup:繼承自Tag,用來生成BeautifulSoup樹,對於一些查找方法,比如find,find_all,select這些,其實還是Tag的
# 3.NavigableString:繼承自python中的str,用起來和str是一樣的
# 4.Comment:繼承自NavigableString

# 4)中國天氣網的爬蟲及數據可視化
from bs4 import BeautifulSoup
import requests
ALL_DATA = []  # accumulates {"city", "max_temp", "min_temp"} dicts across all region pages
def spider():
    """Crawl every region page, then chart the ten coldest cities."""
    # I. One URL per region of China; crawl each in turn.
    urls = ["http://www.weather.com.cn/textFC/hb.shtml",
            "http://www.weather.com.cn/textFC/hd.shtml",
            "http://www.weather.com.cn/textFC/hz.shtml",
            "http://www.weather.com.cn/textFC/hn.shtml",
            "http://www.weather.com.cn/textFC/xb.shtml",
            "http://www.weather.com.cn/textFC/xn.shtml",
            "http://www.weather.com.cn/textFC/gat.shtml"]
    for page_url in urls:
        get_weather_condition(page_url)
    # II. Visualisation: bar chart of the 10 lowest minimum temperatures.
    from pyecharts import Bar
    ALL_DATA.sort(key=lambda item: item["min_temp"])
    coldest = ALL_DATA[0:10]
    cities = [entry["city"] for entry in coldest]
    min_temps = [entry["min_temp"] for entry in coldest]
    chart = Bar("min_temp")
    chart.add("", cities, min_temps)
    chart.render("test.html")
def get_weather_condition(url):
    """Parse one region page and append each city's temperatures to ALL_DATA."""
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
               "Referer":"http: // www.weather.com.cn / textFC / hd.shtml"}
    response = requests.get(url=url,headers=headers)
    page_text = response.content.decode("utf-8")
    # html5lib is the most forgiving parser; one of the region pages is
    # malformed enough to need it.
    soup = BeautifulSoup(page_text,"html5lib")
    # Seven conMidtab divs exist (today plus six hidden forecast days);
    # only today's — the first — is parsed.
    conMidtab = soup.find_all("div",class_="conMidtab")[0]
    # Each province sits in its own table.
    for table in conMidtab.find_all("table"):
        rows = table.find_all("tr")[2:]  # drop the two header rows
        for index, row in enumerate(rows):
            infos = list(row.stripped_strings)
            # The first data row of a province repeats the province name,
            # shifting the interesting columns by one.
            if index == 0:
                city, max_temp, min_temp = infos[1], infos[5], infos[-2]
            else:
                city, max_temp, min_temp = infos[0], infos[4], infos[-2]
            ALL_DATA.append({"city":city,"max_temp":int(max_temp),"min_temp":int(min_temp)})
# Run the weather crawler only when executed as a script.
if __name__ == '__main__':
    spider()
# III.正則表達式
# 1)概念:按照一定的規則,從某個字符串中匹配符出想要的數據,這個規則就是正則表達式
#   區分:利用正則表達式爬蟲不同於xpath語法和bs4,使用時網頁中的內容不存在上下級的關係,案例中我還是按照了這種思路寫,其實沒有這個必要
# 2)re模塊下常用的函數
# 1.match:從一個字符串開始的位置進匹配,如果開始的位置沒有匹配到。就直接失敗了
# 2.search:在字符串中找滿足條件的字符,如果找到就返回,只會找到第一個滿足條件的
# 3.group:分組,正則表達式中可以對過濾到的字符串進行分組,分組使用()的方式
import re
text = "apple price is $10, orange price is $20"
# Fixed: regex literals are now raw strings.  "\$" and "\d" in a plain
# string are invalid escape sequences and warn (SyntaxWarning) on modern
# Python; r"..." keeps regex escapes out of Python's own escape handling.
ret = re.search(r".*(\$\d+),.*(\$\d+)", text)
print(ret.group(0))  # same as ret.group(): the whole match
print(ret.group(1))  # content of the first (...) group; group indexes start at 1
print(ret.groups())  # every group as a tuple (immutable list-like type)
# 4. findall: every match, returned as a list.
import re
text = "apple price is $10, orange price is $20"
ret = re.findall(r"\$\d+", text)
print(ret)
# 5. sub: replace matched text; count limits how many matches are replaced.
import re
text = "apple price is $10, orange price is $20"
ret = re.sub(r"\$\d+", "0", text, flags=0, count=1)  # only "$10" becomes "0"; "$20" is untouched
print(ret)
# 6. split: split the string; maxsplit and flags may be given.
import re
text = "hello wrold !"
ret = re.split(" ", text)
print(ret)
# 7.compile:編譯正則表達式,通過制定re.VERBOSE使其可以添加註釋,推薦!!!見例7

# 3)常用匹配規則:
import re
# 1. Matching single characters
# Match a literal prefix.
text = "hello"
ret = re.match("he",text)
print(ret.group())
# . : any single character
text = "hello"
ret = re.match(".",text)
print(ret.group())
# \d : any digit  (fixed: regex escapes are now in raw strings — "\d" in a
# plain string is an invalid escape sequence on modern Python)
text = "1234"
ret = re.match(r"\d",text)  # equivalent: re.match("[0-9]",text)
print(ret.group())
# \D : any non-digit
text = "ASD213"
ret = re.match(r"\D",text)  # equivalent: re.match("[^0-9]",text)
print(ret.group())
# \s : whitespace, including \n, \t, \r and space
text = "\n"
ret = re.match(r"\s",text)
print(ret.group())
# \w : a-z, A-Z, digits and underscore
text = "ASDW"
ret = re.match(r"\w",text)  # equivalent: re.match("[a-zA-Z0-9_]",text)
print(ret.group())
# \W : complement of \w
text = "+"
ret = re.match(r"\W",text)  # equivalent: re.match("[^a-zA-Z0-9_]",text)
print(ret.group())
# [] : character class; any listed character matches
text = "0571-888888"
ret = re.match(r"[\d-]+",text)
print(ret.group())
# 2. Matching repetitions
# * : zero or more of the preceding atom (whatever precedes the *)
text = "0571"
ret = re.match(r"\d*",text)
print(ret.group())
# + : one or more
text = "abcd"
ret = re.match(r"\w+",text)
print(ret.group())
# ? : zero or one
text = "abcd"
ret = re.match(r"\w?",text)
print(ret.group())
# {m} : exactly m repetitions
text = "abcd"
ret = re.match(r"\w{2}",text)
print(ret.group())
# {m,n} : m to n repetitions, greedy (takes as many as possible)
text = "abcd"
ret = re.match(r"\w{1,3}",text)
print(ret.group())
# 3. Other constructs
# ^ : anchors the start; inside a [] class it negates the class
# $ : anchors the end
# | : alternation of expressions/strings, usually wrapped in ()
# \ : escapes characters that are special to the regex engine
# r : raw string prefix
text = "\\c"  # Python eats one backslash: the actual value is \c (fixed comment; it is not \n)
ret = re.match(r"\\c",text)  # the regex engine eats another layer; "\\\\c" works too
print(ret.group())
# Greedy + vs non-greedy +?
text = "<h1>abc<h2>"  # goal: match just <h1>
ret = re.match("<.+?>",text)  # non-greedy stops at the first >; "<.+>" would take <h1>abc<h2>
print(ret.group())

# 4) Small regex exercises
# 1. Mobile number (rule: starts with 1, second digit in 3/4/5/7/8, 11 digits total)
text = "13303475216"
ret = re.match(r"1[34578]\d{9}",text)
print(ret.group())
# 2. Email (word chars + @ + digits/lowercase + . + lowercase)
# Fixed: the sample literal had been mangled to "[email protected]" and no
# longer matched the pattern, so ret was None and .group() crashed.
text = "someone@163.com"
ret = re.match(r"\w+@[0-9a-z]+\.[a-z]+",text)  # \. escapes the literal dot
print(ret.group())
# 3. URL (http/https/ftp + :// + any non-whitespace)
text = "http://www.baidu.com"
ret = re.match(r"(http|https|ftp)://[^\s]+",text)  # (a|b) expresses alternation
print(ret.group())
# 4. Chinese ID number (18 chars: 17 digits, last one a digit, x or X)
text = "325621198507267315"
ret = re.match(r"[1-9]\d{16}[\dxX]",text)  # fixed: class was [\dzZ], contradicting the x/X rule
print(ret.group())
# 5. Integers 0-100, rejecting forms like 09 and 101
# one digit: 0-9; two digits: 10-99; three digits: only 100
text = "100"
ret = re.match(r"([0-9]$|[1-9]\d$|100$)",text)  # or: re.match(r"([1-9]\d?$|100$)",text)
print(ret.group())
# 6. Lagou job-posting snippet: strip the markup, keep the text
import re
html = """
<dd class="job_bt">
        <h3 class="description">職位描述:</h3>
        <div>
        <p>職位描述:<br>1、參與公司用戶行爲數據的收集和實時計算開發;<br>2、根據業務需求實現實時和離線數據ETL過程<br>3、對外應用系統、數據服務接口的開發<br>4、開發實時數據處理、統計功能,支撐上層業務,如:數據監控、統計分析、日報展現、業務方調用等<br><br>任職要求:<br>1、計算機/軟件工程或相關專業出身,工作3年以上<br>2、紮實的代碼基礎;擅長java或scala。<br>3、熟悉大數據的生態圈和相關組件(hadoop、hive、spark、flink、kafka、hbase等),能夠深瞭解集羣和周邊模塊<br>4、對spark&nbsp;RDD模型有深刻的理解,能針對線上問題進行調優;<br>5、熟悉Mysql,Redis,能夠快速理解業務模型和數據模型<br>6、熟悉Linux環境及腳本開發(Python/Perl/Shell等)</p>
        </div>
    </dd>
"""
ret = re.sub("<.+?>","",html)  # non-greedy: each match is one tag
print(ret)
# 7. compile with re.VERBOSE: a commented, reusable money-matching pattern
import re
text = "apple price is $10.05, orange price is $20.11"
r = re.compile(r"""
    \$ # dollar sign, escaped with \
    \d+ # digits before the decimal point
    \. # the decimal point itself
    \d+ # digits after the decimal point
""",re.VERBOSE)
ret = re.findall(r,text)
print(ret)

# 5)古詩網爬蟲實例
import requests
import re
def main():
    """Iterate over gushiwen list pages 1-5 and parse each one."""
    base_url = "https://www.gushiwen.org/default_{}.aspx"
    for page_no in range(1, 6):
        prase_page(base_url.format(page_no))
def prase_page(page_url):
    """Fetch one list page and print every poem found on it."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Referer": "https://www.gushiwen.org/default_1.aspx"}
    response = requests.get(url=page_url,headers=headers)
    text = response.text
    # Strategy: findall each field into its own list, then zip the lists.
    # - re.DOTALL lets "." also match \n
    # - non-greedy ".*?" keeps each match inside its own cont block
    # - findall returns only the (...) group
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>',text,re.DOTALL)
    danasties = re.findall(r'<div\sclass="cont">.*?<p\sclass="source">.*?<a.*?>(.*?)</a>',text,re.DOTALL)
    authors = re.findall(r'<div\sclass="cont">.*?<p\sclass="source">.*?<a.*?>.*?<a.*?>(.*?)</a>',text,re.DOTALL)
    contents_raw = re.findall(r'<div\sclass="cont">.*?<div\sclass="contson".*?>(.*?)</div>',text,re.DOTALL)
    # Strip the remaining inline tags and surrounding whitespace.
    contents = [re.sub(r"<.*?>", "", raw).strip() for raw in contents_raw]
    # Zip the parallel lists into one dict per poem.
    poems = [{"title": title, "danasty": danasty, "author": author, "content": content}
             for title, danasty, author, content in zip(titles, danasties, authors, contents)]
    for poem in poems:
        print(poem)
        print("="*120)
# Run the poem crawler only when executed as a script.
if __name__ == '__main__':
    main()

# 6)糗事百科爬蟲:順便吐槽一下找不到笑點
import re
import requests
def url():
    """Walk qiushibaike text pages 1-30 and parse each one."""
    base_url = "https://www.qiushibaike.com/text/page/{}/"
    for page_no in range(1, 31):
        prase_url(base_url.format(page_no))
def prase_url(page_url):
    """Fetch one joke-list page and print each joke's plain text.

    Fixed: the regex literals are now raw strings — "\\s" written in a
    plain string is an invalid escape sequence and warns on modern Python.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
              "Referer": "https: // www.qiushibaike.com / text / page / 1 /"}
    response = requests.get(url=page_url,headers=headers)
    text = response.text
    # Each joke's text sits in a span inside its content div.
    contents_raw = re.findall(r'<div\sclass="content">.*?<span>(.*?)</span>',text,re.DOTALL)
    contents = []
    for content in contents_raw:
        a = re.sub(r"\n","",content)  # drop hard line breaks
        a = re.sub(r"<.*?>","",a)     # strip inline tags such as <br/>
        a = a.strip()
        contents.append(a)
    for content in contents:
        print(content)
        print("="*120)
# Run the joke crawler only when executed as a script.
if __name__ == '__main__':
    url()
1.附錄1:tencent.html(騰訊招聘搜索頁源代碼)

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
    <title>職位搜索 | 社會招聘 | Tencent 騰訊招聘</title>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
	<!-- Js Css -->
     		<link media="screen" href="//cdn.m.tencent.com/hr_static/css/all.css?max_age=86412" type="text/css" rel="stylesheet" />
	<script type="text/javascript" src="//cdn.m.tencent.com/hr_static/js/jquery-1.7.2.min.js"></script>
    <script type="text/javascript" src="//cdn.m.tencent.com/hr_static/js/jquery-ui-1.7.2.custom.min.js"></script>
    <script type="text/javascript" src="//cdn.m.tencent.com/hr_static/js/thickbox.js"></script>
    <link media="screen" href="//cdn.m.tencent.com/hr_static/css/thickbox.css" type="text/css" rel="stylesheet" />
    <script type="text/javascript" src="//cdn.m.tencent.com/hr_static/js/functions.js"></script>
    <script type="text/javascript" src="//cdn.m.tencent.com/hr_static/js/utils.js"></script>
    <script language="javascript" src="//vm.gtimg.cn/tencentvideo/txp/js/txplayer.js" charset="utf-8"></script>
    <script type="text/javascript" src="//cdn.m.tencent.com/hr_static/js/all.js?max_age=86412"></script>	<!-- Js Css -->
	<script>
		var keywords_json = ["python"];
	</script>
</head>

<body>
    	<div id="header">
    	<div class="maxwidth">
    		<a href="index.php" class="left" id="logo"><img src="//cdn.m.tencent.com/hr_static/img/logo.png"/></a>
    		<div class="right" id="headertr">
    			<div class="right pl9" id="topshares">
    				<div class="shares">
    					<span class="left">分享到:</span>
		    			<!--<a href="javascript:;" οnclick="shareto('qqt','top');" id="qqt" title="分享到騰訊微博">分享到騰訊微博</a>-->
		    			<a href="javascript:;" οnclick="shareto('qzone','top');" id="qzone" title="分享到QQ空間">分享到QQ空間</a>
		    			<!--<a href="javascript:;" οnclick="shareto('pengyou','top');" id="pengyou" title="分享到騰訊朋友">分享到騰訊朋友</a>-->
		    			<a href="javascript:;"  οnclick="shareto('sinat','top');"id="sinat" title="分享到新浪微博">分享到新浪微博</a>
		    			<!--<a href="javascript:;"  οnclick="shareto('renren','top');"id="renren" title="分享到人人網">分享到人人網</a>-->
		    			<!--<a href="javascript:;"  οnclick="shareto('kaixin001','top');"id="kaixin" title="分享到開心網">分享到開心網</a>-->
		    			<div class="clr"></div>
    				</div>
    				<!--<a href="javascript:;">分享</a>-->
    			</div>
    			<!--<div class="right pl9">-->
    				<!--<a href="http://t.qq.com/QQjobs" id="tqq" target="_blank">收聽騰訊招聘</a>-->
    			<!--</div>-->
    			<div class="right pr9">
    				    				    					<a href="login.php" id="header_login_anchor">登錄</a><span class="plr9">|</span><a href="reg.php">註冊</a>
    				    				<span class="plr9">|</span><a href="question.php">反饋建議</a>
    				<span class="plr9">|</span><a href="http://careers.tencent.com/global" target="_blank">Tencent Global Talent</a>
    				<script>
    					var User_Account = "";
    				</script>
    				    			</div>
    			<div class="clr"></div>
    		</div>
    		<div class="clr"></div>
    	</div>
    	<div id="menus">
    		<div class="maxwidth">
	    		<ul id="menu" class="left">
	    			<li id="nav1" ><a href="index.php">&nbsp;</a></li>
	    			<li id="nav2" class="active" ><a href="social.php">&nbsp;</a></li>
	    			<li id="nav3"><a href="about.php">&nbsp;</a></li>
	    			<li id="nav4"><a href="workInTencent.php">&nbsp;</a></li>
	    		</ul>
	    		<a class="right texti9" target="_blank" id="navxy" href="http://join.qq.com">校園招聘</a>
	    		<div class="clr"></div>
	    	</div>
    	</div>
    </div>    <div id="sociaheader">
			</div>
    <div id="position" class="maxwidth">
    	<a name="a" id="a"></a>
    	<div class="left wcont_b box">
		    <div class="blueline"><div class="butzwss"></div></div>
		    <form id="searchform" class="buts1">
		    	<div id="searchrow1">
		    		<div id="search1"><input id="search2" name="keywords" t="請輸入關鍵詞" value="python" class="left"/><input class="left" id="search3" type="submit" value=""/><div class="clr"></div></div>
		    		<input type="hidden" name="lid" value="0"/>
		    		<input type="hidden" name="tid" value="0"/>
		    	</div>
		    	<div id="searchrow2">
		    		<div class="srow2l left"></div>
		    		<div class="left items pl9 itemnone" id="additems">
		    			<a href="position.php?keywords=python&tid=0" class="item active"><span><font>全部</font></span></a>
		    					    				<a class="item" href="position.php?keywords=python&tid=0&lid=2218"><span><font>深圳</font></span></a>
		    					    				<a class="item" href="position.php?keywords=python&tid=0&lid=2156"><span><font>北京</font></span></a>
		    					    				<a class="item" href="position.php?keywords=python&tid=0&lid=2175"><span><font>上海</font></span></a>
		    					    				<a class="item" href="position.php?keywords=python&tid=0&lid=2196"><span><font>廣州</font></span></a>
		    					    				<a class="item" href="position.php?keywords=python&tid=0&lid=2268"><span><font>成都</font></span></a>
		    					    				<a class="item" href="position.php?keywords=python&tid=0&lid=2252"><span><font>杭州</font></span></a>
		    					    				<a class="item" href="position.php?keywords=python&tid=0&lid=2426"><span><font>昆明</font></span></a>
		    					    				<a class="item" href="position.php?keywords=python&tid=0&lid=33"><span><font>美國</font></span></a>
		    					    				<a class="item" href="position.php?keywords=python&tid=0&lid=2459"><span><font>中國香港</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=2418"><span><font>長春</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=62"><span><font>歐洲</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=2226"><span><font>重慶</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=2355"><span><font>武漢</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=90"><span><font>荷蘭</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=2393"><span><font>太原</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=2406"><span><font>瀋陽</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=2381"><span><font>西安</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=2436"><span><font>貴陽</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=2442"><span><font>呼和浩特</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=2448"><span><font>銀川</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=2225"><span><font>天津</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=2228"><span><font>南京</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=2346"><span><font>鄭州</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=59"><span><font>日本</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=2314"><span><font>南寧</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=2439"><span><font>蘭州</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=2336"><span><font>石家莊</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=2283"><span><font>福州</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=81"><span><font>新加坡</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=2407"><span><font>大連</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=2453"><span><font>烏魯木齊</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=60"><span><font>馬來西亞</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=95"><span><font>雄安新區</font></span></a>
		    					    				<a class="item itemhide" href="position.php?keywords=python&tid=0&lid=2280"><span><font>海口</font></span></a>
		    					    		</div>
							    		<div class="left"><a href="javascript:;" class="more2">更多</a></div>
							    		<div class="clr"></div>
		    	</div>
		    	<div id="searchrow3">
		    		<div class="srow2l left"></div>
		    		<div class="left items pl9">
		    			<a href="position.php?keywords=python&lid=0" class="item active"><span><font>全部</font></span></a>
		    					    				<a class="item" href="position.php?keywords=python&lid=0&tid=87"><span><font>技術類</font></span></a>
		    					    				<a class="item" href="position.php?keywords=python&lid=0&tid=82"><span><font>產品/項目類</font></span></a>
		    					    				<a class="item" href="position.php?keywords=python&lid=0&tid=83"><span><font>市場類</font></span></a>
		    					    				<a class="item" href="position.php?keywords=python&lid=0&tid=81"><span><font>設計類</font></span></a>
		    					    				<a class="item" href="position.php?keywords=python&lid=0&tid=84"><span><font>職能類</font></span></a>
		    					    				<a class="item" href="position.php?keywords=python&lid=0&tid=85"><span><font>內容編輯類</font></span></a>
		    					    				<a class="item" href="position.php?keywords=python&lid=0&tid=86"><span><font>客戶服務類</font></span></a>
		    					    		</div>
		    		<div class="clr"></div>
		    	</div>
		    </form>
		    <table class="tablelist" cellpadding="0" cellspacing="0">
		    	<tr class="h">
		    		<td class="l" width="374">職位名稱</td>
		    		<td>職位類別</td>
		    		<td>人數</td>
		    		<td>地點</td>
		    		<td>發佈時間</td>
		    	</tr>
		    			    	<tr class="even">
		    		<td class="l square"><a target="_blank" href="position_detail.php?id=44592&keywords=python&tid=0&lid=0">OMG097-數據平臺運維(北京)</a></td>
					<td>技術類</td>
					<td>1</td>
					<td>北京</td>
					<td>2018-09-30</td>
		    	</tr>
		    			    	<tr class="odd">
		    		<td class="l square"><a target="_blank" href="position_detail.php?id=44578&keywords=python&tid=0&lid=0">MIG16-基礎架構工程師(北京)</a><span class="hot">&nbsp;</span></td>
					<td>技術類</td>
					<td>2</td>
					<td>北京</td>
					<td>2018-09-30</td>
		    	</tr>
		    			    	<tr class="even">
		    		<td class="l square"><a target="_blank" href="position_detail.php?id=44559&keywords=python&tid=0&lid=0">18796-專項技術測試(深圳)</a><span class="hot">&nbsp;</span></td>
					<td>技術類</td>
					<td>2</td>
					<td>深圳</td>
					<td>2018-09-30</td>
		    	</tr>
		    			    	<tr class="odd">
		    		<td class="l square"><a target="_blank" href="position_detail.php?id=44546&keywords=python&tid=0&lid=0">SNG17-QQ錢包後臺開發工程師(深圳)</a></td>
					<td>技術類</td>
					<td>1</td>
					<td>深圳</td>
					<td>2018-09-30</td>
		    	</tr>
		    			    	<tr class="even">
		    		<td class="l square"><a target="_blank" href="position_detail.php?id=44547&keywords=python&tid=0&lid=0">MIG09-NLP算法工程師</a></td>
					<td>技術類</td>
					<td>1</td>
					<td>北京</td>
					<td>2018-09-30</td>
		    	</tr>
		    			    	<tr class="odd">
		    		<td class="l square"><a target="_blank" href="position_detail.php?id=44543&keywords=python&tid=0&lid=0">SNG07-測試開發高級工程師</a></td>
					<td>技術類</td>
					<td>1</td>
					<td>深圳</td>
					<td>2018-09-30</td>
		    	</tr>
		    			    	<tr class="even">
		    		<td class="l square"><a target="_blank" href="position_detail.php?id=44539&keywords=python&tid=0&lid=0">SNG11-人工智能研究員(深圳)</a><span class="hot">&nbsp;</span></td>
					<td>技術類</td>
					<td>1</td>
					<td>深圳</td>
					<td>2018-09-30</td>
		    	</tr>
		    			    	<tr class="odd">
		    		<td class="l square"><a target="_blank" href="position_detail.php?id=44521&keywords=python&tid=0&lid=0">18435-反洗錢建模工程師</a><span class="hot">&nbsp;</span></td>
					<td>技術類</td>
					<td>2</td>
					<td>深圳</td>
					<td>2018-09-30</td>
		    	</tr>
		    			    	<tr class="even">
		    		<td class="l square"><a target="_blank" href="position_detail.php?id=44526&keywords=python&tid=0&lid=0">18796-後臺專項測試工程師(深圳)</a></td>
					<td>技術類</td>
					<td>1</td>
					<td>深圳</td>
					<td>2018-09-30</td>
		    	</tr>
		    			    	<tr class="odd">
		    		<td class="l square"><a target="_blank" href="position_detail.php?id=44514&keywords=python&tid=0&lid=0">22989-專有云中間件運維工程師(深圳/北京/上海)</a></td>
					<td>技術類</td>
					<td>2</td>
					<td>深圳</td>
					<td>2018-09-30</td>
		    	</tr>
		    			    	<tr class="f">
		    		<td colspan="5">
		    			<div class="left">共<span class="lightblue total">544</span>個職位</div>
		    			<div class="right"><div class="pagenav"><a href="javascript:;" class="noactive" id="prev">上一頁</a><a class="active" href="javascript:;">1</a><a href="position.php?lid=&tid=&keywords=python&start=10#a">2</a><a href="position.php?lid=&tid=&keywords=python&start=20#a">3</a><a href="position.php?lid=&tid=&keywords=python&start=30#a">4</a><a href="position.php?lid=&tid=&keywords=python&start=40#a">5</a><a href="position.php?lid=&tid=&keywords=python&start=50#a">6</a><a href="position.php?lid=&tid=&keywords=python&start=60#a">7</a><a href="position.php?lid=&tid=&keywords=python&start=70#a">...</a><a href="position.php?lid=&tid=&keywords=python&start=540#a">55</a><a href="position.php?lid=&tid=&keywords=python&start=10#a" id="next">下一頁</a><div class="clr"></div></div></div>
		    			<div class="clr"></div>
		    		</td>
		    	</tr>
		    </table>
		</div>
		<div class="right wcont_s box">
		    <div class="blueline"><div class="butcjwt"></div></div>
		    <div class="module_faqs square"><a href="faq.php?id=5" title="如何應聘騰訊公司的職位?">如何應聘騰訊公司的職位?</a><a href="faq.php?id=3" title="應屆生如何應聘?">應屆生如何應聘?</a><a href="faq.php?id=19" title="騰訊應聘流程是什麼?">騰訊應聘流程是什麼?</a><a href="faq.php?id=20" title="我註冊了簡歷,但爲什麼沒有人聯繫我?">我註冊了簡歷,但爲什麼沒...</a><a href="faq.php?id=22" title="我忘記密碼了,怎麼辦?">我忘記密碼了,怎麼辦?</a><a href="faq.php?id=23" title="如何進行簡歷修改?">如何進行簡歷修改?</a></div>		</div>
		<div class="clr"></div>
	</div>
   	<div id="homeDep"><table id="homeads"><tr><td align="center"><a href="http://tencent.avature.net/career" target="blank">全球招聘</a></td><td align="center"><a href="http://game.qq.com/hr/" target="blank">互動娛樂事業羣招聘</a></td><td align="center"><a href="http://hr.tencent.com/position.php?lid=&tid=&keywords=WXG" target="blank">微信事業羣招聘</a></td><td align="center"><a href="http://hr.qq.com/" target="blank">技術工程事業羣招聘</a></td></tr></table></div>    	<div id="footer">
		<div>
			<a href="http://www.tencent.com/" target="_blank">關於騰訊</a><span>|</span><a href="http://www.qq.com/contract.shtml" target="_blank">服務條款</a><span>|</span><a href="http://hr.tencent.com/" target="_blank">騰訊招聘</a><span>|</span><a href="http://careers.tencent.com/global" target="_blank">Tencent Global Talent</a><span>|</span><a href="http://gongyi.qq.com/" target="_blank">騰訊公益</a><span>|</span><a href="http://service.qq.com/" target="_blank">客服中心</a>
	    </div>
		<p>Copyright &copy; 1998 - 2018 Tencent. All Rights Reserved.</p>
	</div>
	<script type="text/javascript" src="//tajs.qq.com/stats?sId=64934792" charset="UTF-8"></script>
</body>
</html>

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章