Python 批量爬取郵箱

Python 批量爬取郵箱地址

#coding: utf-8
import requests
import bs4 #解析網頁
import lxml
import re

# Browser-like request headers passed to requests.get() by the fetch functions.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/111.0.0.0 Safari/537.36'
    ),
}

# Proxy mapping passed to requests.get(); only the "http" scheme has an entry.
# NOTE(review): the URLs fetched in this file are https, so this proxy is
# presumably not applied to them -- confirm whether an 'https' entry is wanted.
proxyip = {'http': '121.13.252.61:41564'}

def fetch_url(url='https://www.douban.com/group/topic/165453665/?start=100&_i=9637470y8YseOC'):
    """Collect the pagination links shown on a topic page.

    Downloads ``url`` and gathers the ``href`` of every ``<a>`` tag found
    inside each ``<div class="paginator">`` element, e.g.
    ``<a href="https://www.douban.com/group/topic/165453665/?start=0">1</a>``
    yields ``https://www.douban.com/group/topic/165453665/?start=0``.

    Args:
        url: Page to inspect. Defaults to the topic page this script was
            written for, so zero-argument calls keep working.

    Returns:
        A list of unique link targets, in the order they first appear.
        ``url`` itself is never included.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            does not complete within the 10-second timeout.
    """
    # A timeout keeps a stalled connection from blocking the script forever.
    page_obj = requests.get(url, headers=headers, proxies=proxyip, timeout=10)
    bs4_obj = bs4.BeautifulSoup(page_obj.text, "lxml")  # use the lxml parser

    url_list = []
    # Seeding with ``url`` keeps the current page out of the result.
    seen = {url}
    for paginator in bs4_obj.find_all(name="div", attrs={"class": "paginator"}):
        # find() would stop at the first anchor; find_all() walks every
        # page link inside the paginator.
        for anchor in paginator.find_all("a"):
            href = anchor.attrs.get("href")
            # Skip anchors without a target and links already collected
            # (prev/next links repeat the numbered page links).
            if not href or href in seen:
                continue
            seen.add(href)
            url_list.append(href)
    return url_list

# One e-mail address: local part, "@", then a domain with at least one
# literal dot. Compiled with re.ASCII so \w never matches the Chinese text
# that often sits directly next to an address in a reply.
_MAIL_PATTERN = re.compile(r"\w[\w.+-]*@[\w-]+(?:\.[\w-]+)+", flags=re.A)


def fetch_mail_address(url):
    """Extract the e-mail addresses posted as replies on one topic page.

    Each ``<div class="reply-doc content">`` is treated as one reply. The
    text of its ``<p class="reply-content">`` is searched for an address and
    only the first match in a reply is kept.

    Args:
        url: Address of the topic page to download.

    Returns:
        A list of ``[address, pubtime_text]`` pairs, one per reply that
        contains an address. ``pubtime_text`` is the text of the reply's
        ``<span class="pubtime">``, or ``""`` when that span is missing.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            does not complete within the 10-second timeout.
    """
    # A timeout keeps a stalled connection from blocking the script forever.
    page_obj = requests.get(url, headers=headers, proxies=proxyip, timeout=10)
    bs4_obj = bs4.BeautifulSoup(page_obj.text, "lxml")  # use the lxml parser
    replies = bs4_obj.find_all(name="div", attrs={"class": "reply-doc content"})

    mail_list = []
    for reply in replies:
        content = reply.find("p", attrs={"class": "reply-content"})
        if content is None:
            # Reply without a text paragraph: nothing to search.
            continue
        match = _MAIL_PATTERN.search(content.text)
        if match is None:
            continue
        pubtime = reply.find("span", attrs={"class": "pubtime"})
        pubtime_text = pubtime.text if pubtime is not None else ""
        mail_list.append([match.group(), pubtime_text])
    return mail_list


if __name__ == "__main__":
    # Scrape every page linked from the paginator first, then the page the
    # crawl started from; each entry is that page's list of
    # [address, pubtime] records.
    per_page_records = [fetch_mail_address(page_url) for page_url in fetch_url()]
    per_page_records.append(
        fetch_mail_address('https://www.douban.com/group/topic/165453665/?start=100&_i=9637470y8YseOC')
    )
    print('----------------------------------------------')

    # Flatten the per-page lists, keeping only the address of each record.
    addresses = [record[0] for page in per_page_records for record in page]

    print(addresses)
    print(len(addresses))
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章