python爬蟲及其他知識記錄

pip install mysql-connector-python -i https://pypi.doubanio.com/simple

數據庫

python2.5之後自帶sqlite3數據庫

import sqlite3
# sqlite3.connect() creates test.db automatically when the file does not exist yet.
conn = sqlite3.connect('test.db')
try:
    c = conn.cursor()
    # Parameterised insert: the values are bound to the ? placeholders by the driver,
    # never spliced into the SQL text.
    c.execute(
        'insert into articleInfo (title,url,date,wechatName,state) values(?,?,?,?,?)',
        (self.title[0].text, self.url, self.date[0].text, self.gzh[0].text, self.state),
    )
    conn.commit()
finally:
    # Close the connection even when the insert fails, so the database file is released.
    conn.close()

使用mysql

安裝：pip install mysql-connector-python -i https://pypi.doubanio.com/simple

   python
   import mysql.connector
   # Connect to the local MySQL database named "test".
   conn = mysql.connector.connect(user='root', password='password', database='test', charset='utf8')
   try:
       cursor = conn.cursor()
       cursor.execute('create table user (id varchar(20) primary key, name varchar(20))')
       # MySQL placeholders are %s; the parameters are passed as a tuple.
       cursor.execute('select * from user where id = %s', ('1',))
       values = cursor.fetchall()
       # rowcount is read after fetchall() has consumed the result set.
       flag = cursor.rowcount
       conn.commit()
       cursor.close()
   finally:
       # Release the connection even when one of the statements fails.
       conn.close()

使用selenium編寫爬蟲，爬取微信公衆號文章
```
# 安裝
pip install selenium
```
- 瀏覽器驅動安裝(將chromedriver.exe或geckodriver.exe加入環境變量，版本注意要和瀏覽器版本兼容)
http://npm.taobao.org/mirrors/chromedriver/

ip代理

網站：攜趣http代理

提供每天一條免費長效代理（需要和客服溝通獲得賬號和密碼）或臨時ip

其他臨時代理網站：

https://www.kuaidaili.com/free/

http://www.data5u.com/

https://github.com/jhao104/proxy_pool

https://www.freeip.top/ （優秀）

圖牀：https://hashx.cn/image/xHkW

反爬：https://blog.csdn.net/congpao4329/article/details/100367345

qtdesigner安裝

pip install pyqt5

pip install PyQt5-tools

\Python\Python37\Lib\site-packages\pyqt5_tools\Qt\bin下designer.exe

寫入文件

# Append one line to the crawler log; mode "a" keeps what is already in the file.
with open("爬蟲日誌.txt", mode="a", encoding="utf-8") as log_file:
    log_file.write(message + '\n')

創建目錄

import os

# Create the output folder. Attempting the mkdir directly (instead of checking
# os.path.exists first) avoids the window in which the folder could appear
# between the check and the creation.
path = './發改局信息文件夾'
try:
    os.mkdir(path)
except FileExistsError:
    # The folder is already there: report it and carry on.
    print('./發改局信息文件夾存在')
except OSError as e:
    # Any other failure (permissions, missing parent, ...): show the reason too.
    print('創建文件夾失敗', e)

操作excel

import xlwt
# Build a workbook with one sheet and write the three header cells of row 0.
wb = xlwt.Workbook()
# cell_overwrite_ok=True allows the same cell to be written more than once.
ws = wb.add_sheet(sheetName, cell_overwrite_ok=True)
ws.write(0, 1, '環比(上月=100)')  # the first two arguments are the row and the column
ws.write(0, 2, '同比(上年同月=100)')
# The closing bracket was missing from this header text.
ws.write(0, 3, '定基（2015年=100）')
wb.save('./國家統計局數據.xls')

tkinter界面

from tkinter import *
import threading
def button_func():
    """Button callback: run main() on a separate thread.

    Without the thread the button would stay pressed until the processing
    function finished; starting a thread lets the interface and the
    processing run asynchronously.
    """
    worker = threading.Thread(target=main)  # main is the processing function
    worker.start()
# Main window: a caption label, two check boxes, a start button and a text area,
# stacked top to bottom with pack().
win = Tk()
win.title('soei數據爬取工具')
win.geometry("400x400+200+50")
lb = Label(win, text='房地產宏觀經濟指標需求信息爬取', fg='blue')
lb.pack()
# One BooleanVar per check box holds its checked state.
h1 = BooleanVar()
h2 = BooleanVar()
# NOTE(review): judgeChoose is not defined in this snippet; it must exist before this runs.
cb1 = Checkbutton(win, text='所有數據', variable=h1, command=judgeChoose)
cb1.pack()
cb2 = Checkbutton(win, text='當月數據', variable=h2, command=judgeChoose)
cb2.pack()
# The button hands the work to button_func, which starts the processing thread.
button = Button(win, command=button_func, text='開始爬取')
button.pack()
text = Text(win, width=30, height=10)
text.pack()
# Insert text at the insertion cursor.
text.insert('insert', '爬取完成')
# Read the whole content of the text widget (the widget is named text, not text1).
value = text.get('0.0', 'end')
win.mainloop()

# Treeview widget with four columns; show='headings' displays only the heading row.
column_ids = ['1', '2', '3', '4']
tree = ttk.Treeview(win, columns=column_ids, show='headings')
# Column options, applied in column order.
# Alternative for the first column: tree.column('1',width=280,anchor='center')
for column_id, options in (
    ('1', {'width': 280}),
    ('2', {'width': 90}),
    ('3', {'width': 90}),
    ('4', {'width': 60, 'anchor': 'center'}),
):
    tree.column(column_id, **options)
# Heading captions, applied in column order.
for column_id, caption in zip(column_ids, ('文章名', '發佈日期', '公衆號', '是否有效')):
    tree.heading(column_id, text=caption)
# Vertical scrollbar wired to the tree in both directions.
vbar = ttk.Scrollbar(win, orient=VERTICAL, command=tree.yview)
tree.configure(yscrollcommand=vbar.set)
tree.grid(row=1, column=1, columnspan=4, sticky=NSEW)
vbar.grid(row=1, column=5, sticky=NS)
# Input box.
entry = Entry(win, width=30)
entry.grid(row=0, column=1)
# Read the current content of the input box.
url = entry.get()
# Values for one Treeview row.
value = (
    self.title[0].text,
    self.date[0].text,
    self.gzh[0].text,
    flag,
)
# Pop up a message box, then append the row at the end of the tree.
tkinter.messagebox.askokcancel('提示', '下載結束')
tree.insert('', 'end', values=value)

獲取返回json數據

import json
# response is the object returned by response=requests.get(url).
payload = json.loads(response.text)
# Walk down: returndata -> first entry of wdnodes -> first entry of nodes.
first_wdnode = payload.get('returndata').get('wdnodes')[0]
owDataNode = first_wdnode.get('nodes')[0]

獲得系統時間

>>> import datetime
>>> print('系統時間： '+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
系統時間： 2020-01-16 12:00:13
#----------------------------------------
>>> import time
>>> bTime=time.time()
>>> eTime=time.time()
>>> totalTime=eTime-bTime
>>> a =totalTime/60
>>> b =totalTime%60
>>> print('總用時： '+str(int(a))+'分'+str(int(b))+'秒')
總用時： 4分48秒

selenium使用(chrome)

//安裝
pip install selenium

chromedriver.exe(瀏覽器驅動，需要與瀏覽器版本匹配)設置環境變量

# Chrome driven through selenium: options, proxy, then element lookup and click.
# NOTE(review): webdriver, UserAgent and ActionChains are not imported in this snippet.
opt = webdriver.ChromeOptions()
# Headless mode (disabled here) ----------------------
# opt.add_argument('--headless')
# ----------------------------------------------------
# Do not load images (disabled here) -----------------
# prefs = {"profile.managed_default_content_settings.images": 2}
# opt.add_experimental_option("prefs", prefs)
# ----------------------------------------------------
# Request header: random User-Agent.
opt.add_argument('user-agent=' + str(UserAgent().random))
# Proxy ip.
ip = 'ip:port'
opt.add_argument('--proxy-server=http://' + ip)
browser = webdriver.Chrome(options=opt)
# URL of the current page.
url = browser.current_url
# Match elements by xpath; i is the 1-based index of the list item.
# NOTE(review): find_elements_by_xpath exists only in older selenium releases; newer
# ones use find_elements(By.XPATH, ...) — confirm against the installed version.
title = browser.find_elements_by_xpath('/html/body/div[2]/div[1]/div[3]/ul/li['+str(i)+']/div[2]/h3/a')
# Print the text of the first match.
print(title[0].text)
# Find the element by xpath, move the pointer onto it and click.
# (These three statements were fused onto a single line, which is a syntax error.)
action = ActionChains(browser)
elementLocation = browser.find_elements_by_xpath('/html/body/div[2]/div[1]/div[3]/ul/li['+str(i)+']/div[2]/h3/a')[0]
action.move_to_element(elementLocation).click().perform()

xpath語法

.//div[@class="TRS_PreAppend"]/div[2]/table/tbody/tr[1]/td[2]/p/span/text()
//*[@id="sogou_vr_11002601_account_1"]

requests庫使用

get請求

import requests
from fake_useragent import UserAgent
# Use a random User-Agent string as the request header.
user_agent = str(UserAgent().random)
headers = {'User-Agent': user_agent}
# GET with the custom header and a timeout of 5.
response = requests.get(url, timeout=5, headers=headers)
# Status code (for example 200 or 404) and the page source.
code, content = response.status_code, response.text

post請求

  # Form fields sent in the POST body.
  data = {'page': page, 'size': size, 'sort': '', 'id': id, 'buildNo': buildNo}
  # Same timeout as the GET example: without one the call can block indefinitely.
  response = requests.post(url, data=data, headers=headers, timeout=5)

urllib下載圖片

 import urllib.request
 # Fetch the image bytes; the with-block closes the HTTP response once it has been read.
 with urllib.request.urlopen(url) as response:
     img = response.read()
 # tname is the target file name.
 name = tname
 # Binary mode: the bytes are written exactly as received.
 with open(name, 'wb') as f:
     f.write(img)

輸出錯誤

import traceback
 try:
     pass  # put the code that may raise here
 except Exception:
     # format_exc() returns the full traceback of the exception being handled.
     print(traceback.format_exc())

刪除文件

import os
# Delete the file at the path held in File.
os.remove(File)
------------------------
# Delete the file if it exists. Attempting the removal and ignoring only
# "file not found" avoids the gap between an exists() check and the remove().
try:
    os.remove(File)
except FileNotFoundError:
    pass
------------------------
# Can create the parent directory and the sub-directory in one call.
# NOTE(review): makeDir is not defined anywhere in these notes; presumably
# os.makedirs(path) is what is meant — confirm.
#path='duzhenwen\\file\\'
makeDir(path)

退出進程

# If execution fails and the process is not exited, the process stays occupied
# and the files it holds cannot be used.
# The two calls below are alternatives; use one or the other.
# On failure, exit with status 1.
os._exit(1)
# On success, exit with status 0.
os._exit(0)

字符串中是否包含另一字符串

# "in" is true exactly when str1.find('str2') would not return -1.
if 'str2' in str1:
    str3 = str1.replace('old', 'new')  # replace every 'old' with 'new'

讀取json文件

# Parse the JSON file; the with-block closes the handle even when parsing or
# the key lookup fails.
with open(jsonFile, encoding='utf-8') as jsonString:
    jsonData = json.load(jsonString)
    # Read a value.
    name = jsonData['name']
# The file is closed at this point, so it can be deleted.
os.remove(jsonFile)

python爬蟲及其他知識記錄

數據庫

ip代理

qtdesigner安裝

寫入文件

創建目錄

操作excel

tkinter界面

獲取返回json數據

獲得系統時間

selenium使用(chrome)

xpath語法

requests庫使用

get請求

post請求

urllib下載圖片

輸出錯誤

刪除文件

退出進程

字符串中是否包含另一字符串

讀取json文件

HTTP URL 詳解

springBoot中文文檔

excel數據導入mysql

git上傳本地項目到coding

阿里雲服務器部署java項目

c#發送請求訪問外部接口

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結