Selenium 解析清華鏡像

原創

凌空的桨

2019-08-07 22:46

首先配置好Selenium和chrome driver，這個不再贅述，接着就是去解析了。

鏡像站的目錄頁面結構還算是比較有規律。

所以大概有兩種方案，我把這兩種方案都放在一起。

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
import time
import random

from selenium.webdriver.common.by import By  # locator names for find_elements()

# Version 1: walk the TUNA PyPI mirror listing four pages deep and append the
# links found on the deepest pages to tsinghua_url.txt.
#
#   level 1  .../packages/        table rows whose first cell is 3 chars ('00/')
#   level 2  .../packages/00/     same rule
#   level 3  .../packages/00/00/  every href longer than 110 characters
#   level 4  (hash directory)     every href longer than 130 characters -> file
#
# Each level gets its own Chrome window, so a parent page is never navigated
# away from while its elements are still being iterated. Every browser is
# released in a `finally` so a failure deep in the crawl does not leave Chrome
# processes behind.
driver_1 = webdriver.Chrome()
try:
    driver_1.implicitly_wait(0)
    url_1 = 'https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/'
    driver_1.get(url_1)

    for tr_1 in driver_1.find_elements(By.TAG_NAME, "tr"):
        td_list_1 = tr_1.find_elements(By.TAG_NAME, "td")
        if not td_list_1:
            continue  # row without <td> cells
        text_1 = td_list_1[0].text
        if len(text_1) != 3:
            continue  # only 3-character entries are followed
        # e.g. 'https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/00/'
        url_2 = url_1 + text_1

        driver_2 = webdriver.Chrome()
        try:
            # Random 1-10 s pause, applied once this sub-tree is finished.
            time_2 = random.randint(1, 10)
            driver_2.get(url_2)

            for tr_2 in driver_2.find_elements(By.TAG_NAME, "tr"):
                td_list_2 = tr_2.find_elements(By.TAG_NAME, "td")
                if not td_list_2:
                    continue
                text_2 = td_list_2[0].text
                if len(text_2) != 3:
                    continue
                # e.g. 'https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/00/00/'
                url_3 = url_2 + text_2

                driver_3 = webdriver.Chrome()
                try:
                    time_3 = random.randint(1, 10)
                    driver_3.get(url_3)

                    for link_3 in driver_3.find_elements(By.XPATH, "//*[@href]"):
                        href_3 = link_3.get_attribute('href') or ''
                        # The level-3 page URL is 61 characters; a link to a
                        # 60-character hash directory is 122, e.g.
                        # https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/00/02/167a1d7be500797b57a6db0c628ac81f5dd646291e624a8b3e7a9cef2417/
                        # so the 110 threshold is meant to keep only those.
                        if len(href_3) <= 110:
                            continue
                        url_4 = href_3

                        driver_4 = webdriver.Chrome()
                        try:
                            time_4 = random.randint(1, 10)
                            driver_4.get(url_4)

                            for link_4 in driver_4.find_elements(By.XPATH, "//*[@href]"):
                                check_txt = link_4.get_attribute('href') or ''
                                # Longer than the 122-character directory URL
                                # plus a short name: presumably a file link.
                                if len(check_txt) > 130:
                                    # `with` closes the handle after each write;
                                    # the file is only created on a match.
                                    with open('tsinghua_url.txt', 'a') as f_file:
                                        f_file.write(check_txt + '\n')

                            time.sleep(time_4)
                        finally:
                            # quit() closes every window and ends the session.
                            driver_4.quit()

                    time.sleep(time_3)
                finally:
                    driver_3.quit()

            time.sleep(time_2)
        finally:
            driver_2.quit()

    time.sleep(1)
finally:
    driver_1.quit()

一定要去看一個網頁的源碼！

第二個版本：

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
import time
import numpy as np 
import random

from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By  # locator names for find_elements()

# Version 2: same four-level crawl of the TUNA PyPI mirror, but every level is
# selected purely by the length of the link's absolute href. The base URL is
# 55 characters long, so:
#
#   58  = base + '00/'                      first-level directory
#   61  = base + '00/00/'                   second-level directory
#   122 = base + '00/00/' + 60 chars + '/'  hash directory, e.g.
#         https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/00/02/167a1d7be500797b57a6db0c628ac81f5dd646291e624a8b3e7a9cef2417/
#   >130                                    links on the hash-directory page,
#                                           appended to tsinghua_url.txt
#
# Each level uses its own Chrome window and every window is released in a
# `finally`, so an error deep in the crawl does not leave browsers running.
driver_1 = webdriver.Chrome()
try:
    driver_1.implicitly_wait(0)
    url_1 = 'https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/'
    driver_1.get(url_1)

    for link_1 in driver_1.find_elements(By.XPATH, "//*[@href]"):
        text_1 = link_1.get_attribute('href') or ''
        if len(text_1) != 58 or not text_1.endswith('/'):
            continue
        # e.g. 'https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/00/'
        url_2 = text_1

        driver_2 = webdriver.Chrome()
        try:
            # Sub-second random pause, applied once this sub-tree is finished.
            time_2 = np.random.rand()
            driver_2.get(url_2)

            for link_2 in driver_2.find_elements(By.XPATH, "//*[@href]"):
                text_2 = link_2.get_attribute('href') or ''
                if len(text_2) != 61 or not text_2.endswith('/'):
                    continue
                # e.g. 'https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/00/00/'
                url_3 = text_2

                driver_3 = webdriver.Chrome()
                try:
                    time_3 = np.random.rand()
                    driver_3.get(url_3)

                    for link_3 in driver_3.find_elements(By.XPATH, "//*[@href]"):
                        text_3 = link_3.get_attribute('href') or ''
                        if len(text_3) != 122:
                            continue
                        url_4 = text_3

                        driver_4 = webdriver.Chrome()
                        try:
                            time_4 = np.random.rand()
                            try:
                                driver_4.get(url_4)
                            except WebDriverException as e:
                                # Navigation errors surface as
                                # WebDriverException (the previous
                                # ZeroDivisionError handler could never fire).
                                # Report it and retry the load once; a second
                                # failure propagates.
                                print('except:', e)
                                driver_4.get(url_4)

                            for link_4 in driver_4.find_elements(By.XPATH, "//*[@href]"):
                                check_txt = link_4.get_attribute('href') or ''
                                if len(check_txt) > 130:
                                    # `with` closes the handle after each write;
                                    # the file is only created on a match.
                                    with open('tsinghua_url.txt', 'a') as f_file:
                                        f_file.write(check_txt + '\n')

                            time.sleep(time_4)
                        finally:
                            # quit() closes every window and ends the session.
                            driver_4.quit()

                    time.sleep(time_3)
                finally:
                    driver_3.quit()

            time.sleep(time_2)
        finally:
            driver_2.quit()

    time.sleep(1)
finally:
    driver_1.quit()

網絡一定要穩定啊

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

Selenium 解析清華鏡像

lightdb hash index的性能和限制

[翻譯]Joint Discriminative and Generative Learning for Person Re-identification

Selenium 解析清華鏡像

修改opencv源碼並測試

Python-Opencv [ERROR: recursion is detected during loading of "cv2"

透視變換——小試

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結