目錄 |
---|
0. 項目介紹 |
1. 構造瀏覽器實例 |
2. 爬取賬戶頁中所有帖子的鏈接 |
3. 爬取並下載帖子頁中的視頻 |
4. 完整代碼 |
0. 項目介紹
本項目的目的是輸入指定TikTok賬戶頁的鏈接,輸出該賬戶每一個帖子的視頻。本項目的基本結構參考這篇博文。
本項目需要先導入如下庫:
from selenium import webdriver
import json, time, os
本項目的全局變量如下(留空的值需按本機環境自行填寫,例如代理端口以及 Firefox、geckodriver、wget 的路徑,否則代碼無法運行):
# NOTE: the assignment below is intentionally left blank in the article and is
# not valid Python as written; supply the local proxy port (presumably an
# integer) before running. It is used for both the Firefox proxy preference
# and the wget proxy URL.
sslPort =
# Passed to the Firefox driver as `firefox_binary`.
fxBinaryPath = ''
# Passed to the Firefox driver as `executable_path`.
geckodriverPath = ''
# JavaScript executed to jump to the bottom of the page: it sets scrollTop to
# a very large value.
pageDownJS = 'document.documentElement.scrollTop=100000000'
# Root folder; one sub-folder per account uniqueId is created beneath it.
outputPath = ''
# Command/path used to invoke wget for the actual video download.
wgetPath = ''
# Proxy URL handed to wget through `--execute https_proxy=...`.
httpsProxy = 'https://127.0.0.1:{}/'.format(str(sslPort))
本項目的基本結構如下:
def Main():
    """Prompt for a TikTok profile link and download the video of every post.

    The profile page is crawled by ``PROFILE`` to collect post links, then
    each link is handed to ``POST`` which downloads its video.
    """
    profileUrl = input('Please input the tiktok profile link: ')
    # PROFILE.Main can come back without a list when the crawl fails; treat
    # that as "no posts" instead of crashing on iteration over None.
    urlList = PROFILE().Main(profileUrl) or []
    for url in urlList:
        POST().Main(url)
1. 構造瀏覽器實例
# Firefox profile used by the crawler. `webdriver.FirefoxProfile` is the same
# class as `webdriver.firefox.firefox_profile.FirefoxProfile`.
fxProfile = webdriver.FirefoxProfile()

# Proxy settings: HTTPS requests are sent to 127.0.0.1:<sslPort>.
fxProfile.set_preference('network.proxy.type', 1)
fxProfile.set_preference('network.proxy.ssl', '127.0.0.1')
fxProfile.set_preference('network.proxy.ssl_port', sslPort)
fxProfile.set_preference('network.proxy.socks_remote_dns', True)
fxProfile.set_preference('network.trr.mode', 2)

# Presumably disables image loading to keep page loads light -- confirm
# against the Firefox preference documentation.
fxProfile.set_preference('permissions.default.image', 2)
fxProfile.set_preference(
    'intl.accept_languages',
    'zh-CN, zh, zh-TW, zh-HK, en-US, en',
)

# Shared browser instance used by PROFILE and POST. `webdriver.Firefox` is
# the same class as `webdriver.firefox.webdriver.WebDriver`.
fxDriver = webdriver.Firefox(
    firefox_profile=fxProfile,
    firefox_binary=fxBinaryPath,
    executable_path=geckodriverPath,
)
2. 爬取賬戶頁中所有帖子的鏈接
class PROFILE(object):
    """Crawl a TikTok profile page and collect the links of all its posts.

    Uses the module-level ``fxDriver`` browser and the ``pageDownJS`` script.
    """

    # XPath of the <a> element that wraps each post on the profile page.
    _POST_LINK_XPATH = '//a[contains(@class, "video-feed-item-wrapper")]'

    def _FindPostLinks(self):
        """Return the post link elements currently present in the page."""
        return fxDriver.find_elements_by_xpath(self._POST_LINK_XPATH)

    def _ScrollToBottom(self):
        """Scroll the page to the bottom so that more posts get loaded."""
        fxDriver.execute_script(pageDownJS)

    def GetLocY(self):
        """Return the y coordinate of the last post link on the page.

        Returns 0 when the page shows no post at all (the original code
        raised UnboundLocalError in that case).
        """
        links = self._FindPostLinks()
        if not links:
            return 0
        # Only the last element matters; no need to query every location.
        return links[-1].location['y']

    def JudgeLoading(self, locY, interval=0.5):
        """Wait briefly, then report whether new posts appeared below ``locY``.

        Args:
            locY: y coordinate of the last post seen before scrolling.
            interval: seconds to wait before measuring again.

        Returns:
            The new y coordinate of the last post if it moved further down,
            otherwise None.
        """
        time.sleep(interval)
        locYNew = self.GetLocY()
        return locYNew if locY < locYNew else None

    def GetPostUrl(self):
        """Return the ``href`` of every post link currently in the page."""
        return [e.get_attribute('href') for e in self._FindPostLinks()]

    def GetWholePage(self, maxLoadFail=20, interval=0.5):
        """Keep scrolling until no new post loads, then return all post links.

        Args:
            maxLoadFail: number of consecutive unsuccessful checks tolerated
                after a scroll; the crawl stops once this is exceeded.
            interval: seconds to wait between two checks.

        Returns:
            A list with the link of every post found on the page.
        """
        locY = self.GetLocY()
        loadFailCount = 0
        while True:
            self._ScrollToBottom()
            # Poll until the last post moves down (new content arrived) or
            # the failure budget is used up (end of the page reached).
            while True:
                locYNew = self.JudgeLoading(locY, interval)
                if locYNew is not None:
                    loadFailCount = 0
                    locY = locYNew
                    break
                loadFailCount += 1
                if loadFailCount > maxLoadFail:
                    return self.GetPostUrl()

    def Main(self, profileUrl):
        """Open ``profileUrl`` and return the links of all its posts.

        Any error is printed and an empty list is returned, so callers can
        always iterate over the result.
        """
        try:
            fxDriver.get(profileUrl)
            return self.GetWholePage()
        except Exception as e:
            print(e)
            return []
3. 爬取並下載帖子頁中的視頻
class POST(object):
    """Extract the video of a single TikTok post page and download it.

    Uses the module-level ``fxDriver`` browser and the ``outputPath``,
    ``wgetPath`` and ``httpsProxy`` settings.
    """

    def _ReadPageJson(self):
        """Return the text of the page's embedded application/json script."""
        element = fxDriver.find_element_by_xpath('//script[@type="application/json"]')
        return element.get_attribute('textContent')

    def GetInfo(self, html=None):
        """Read the video information from the currently opened post page.

        Args:
            html: unused; kept (now optional) only so that existing callers
                passing an argument keep working.

        Returns:
            A tuple ``(vidUrl, createTime, uniqueID)`` where ``createTime``
            is the local creation time formatted as 'YYYY-MM-DD HH:MM:SS'.
        """
        videoData = json.loads(self._ReadPageJson())['props']['pageProps']['videoData']
        itemInfos = videoData['itemInfos']
        vidUrl = itemInfos['video']['urls'][0]
        createTimeStamp = int(itemInfos['createTime'])
        createTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(createTimeStamp))
        uniqueID = videoData['authorInfos']['uniqueId']
        return vidUrl, createTime, uniqueID

    @staticmethod
    def _BuildWgetCommand(wget, outFile, proxy, vidUrl):
        """Return the wget invocation as an argument list (no shell involved)."""
        return [
            wget,
            '--output-document={}'.format(outFile),
            '--no-check-certificate',
            '--execute', 'https_proxy={}'.format(proxy),
            '--execute', 'robots=off',
            '--continue',
            vidUrl,
        ]

    def DownloadFile(self, info):
        """Download the video described by ``info`` into the account's folder.

        Args:
            info: the ``(vidUrl, createTime, uniqueID)`` tuple from GetInfo.

        The file is saved as ``<outputPath>/<uniqueID>/<YYYYMMDDHHMMSS>.mp4``.
        """
        # Local import: the file's import block does not provide subprocess.
        import subprocess

        vidUrl = info[0]
        # Strip the separators so the timestamp is usable as a file name.
        createTime = info[1].replace('-', '').replace(':', '').replace(' ', '')
        uniqueID = info[2]
        folder = os.path.join(outputPath, uniqueID)
        # An already existing folder is the normal case after the first post.
        os.makedirs(folder, exist_ok=True)
        outFile = os.path.join(folder, '{}.mp4'.format(createTime))
        # An argument list keeps paths with spaces intact and prevents the
        # page-supplied URL from being interpreted by a shell.
        command = self._BuildWgetCommand(wgetPath, outFile, httpsProxy, vidUrl)
        result = subprocess.run(command)
        if result.returncode != 0:
            print('wget exited with code {} for {}'.format(result.returncode, vidUrl))

    def Main(self, url):
        """Open the post at ``url`` and download its video; errors are printed."""
        try:
            fxDriver.get(url)
            # The original passed an undefined name `html` here, so every
            # post failed with NameError before anything was downloaded.
            info = self.GetInfo()
            self.DownloadFile(info)
        except Exception as e:
            print(e)
4. 完整代碼
from selenium import webdriver
import json, time, os
# NOTE: the assignment below is intentionally left blank in the article and is
# not valid Python as written; supply the local proxy port (presumably an
# integer) before running. It is used for both the Firefox proxy preference
# and the wget proxy URL.
sslPort =
# Passed to the Firefox driver as `firefox_binary`.
fxBinaryPath = ''
# Passed to the Firefox driver as `executable_path`.
geckodriverPath = ''
# JavaScript executed to jump to the bottom of the page: it sets scrollTop to
# a very large value.
pageDownJS = 'document.documentElement.scrollTop=100000000'
# Root folder; one sub-folder per account uniqueId is created beneath it.
outputPath = ''
# Command/path used to invoke wget for the actual video download.
wgetPath = ''
# Proxy URL handed to wget through `--execute https_proxy=...`.
httpsProxy = 'https://127.0.0.1:{}/'.format(str(sslPort))
# Firefox profile used by the crawler. `webdriver.FirefoxProfile` is the same
# class as `webdriver.firefox.firefox_profile.FirefoxProfile`.
fxProfile = webdriver.FirefoxProfile()

# Proxy settings: HTTPS requests are sent to 127.0.0.1:<sslPort>.
fxProfile.set_preference('network.proxy.type', 1)
fxProfile.set_preference('network.proxy.ssl', '127.0.0.1')
fxProfile.set_preference('network.proxy.ssl_port', sslPort)
fxProfile.set_preference('network.proxy.socks_remote_dns', True)
fxProfile.set_preference('network.trr.mode', 2)

# Presumably disables image loading to keep page loads light -- confirm
# against the Firefox preference documentation.
fxProfile.set_preference('permissions.default.image', 2)
fxProfile.set_preference(
    'intl.accept_languages',
    'zh-CN, zh, zh-TW, zh-HK, en-US, en',
)

# Shared browser instance used by PROFILE and POST. `webdriver.Firefox` is
# the same class as `webdriver.firefox.webdriver.WebDriver`.
fxDriver = webdriver.Firefox(
    firefox_profile=fxProfile,
    firefox_binary=fxBinaryPath,
    executable_path=geckodriverPath,
)
class POST(object):
    """Extract the video of a single TikTok post page and download it.

    Uses the module-level ``fxDriver`` browser and the ``outputPath``,
    ``wgetPath`` and ``httpsProxy`` settings.
    """

    def _ReadPageJson(self):
        """Return the text of the page's embedded application/json script."""
        element = fxDriver.find_element_by_xpath('//script[@type="application/json"]')
        return element.get_attribute('textContent')

    def GetInfo(self, html=None):
        """Read the video information from the currently opened post page.

        Args:
            html: unused; kept (now optional) only so that existing callers
                passing an argument keep working.

        Returns:
            A tuple ``(vidUrl, createTime, uniqueID)`` where ``createTime``
            is the local creation time formatted as 'YYYY-MM-DD HH:MM:SS'.
        """
        videoData = json.loads(self._ReadPageJson())['props']['pageProps']['videoData']
        itemInfos = videoData['itemInfos']
        vidUrl = itemInfos['video']['urls'][0]
        createTimeStamp = int(itemInfos['createTime'])
        createTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(createTimeStamp))
        uniqueID = videoData['authorInfos']['uniqueId']
        return vidUrl, createTime, uniqueID

    @staticmethod
    def _BuildWgetCommand(wget, outFile, proxy, vidUrl):
        """Return the wget invocation as an argument list (no shell involved)."""
        return [
            wget,
            '--output-document={}'.format(outFile),
            '--no-check-certificate',
            '--execute', 'https_proxy={}'.format(proxy),
            '--execute', 'robots=off',
            '--continue',
            vidUrl,
        ]

    def DownloadFile(self, info):
        """Download the video described by ``info`` into the account's folder.

        Args:
            info: the ``(vidUrl, createTime, uniqueID)`` tuple from GetInfo.

        The file is saved as ``<outputPath>/<uniqueID>/<YYYYMMDDHHMMSS>.mp4``.
        """
        # Local import: the file's import block does not provide subprocess.
        import subprocess

        vidUrl = info[0]
        # Strip the separators so the timestamp is usable as a file name.
        createTime = info[1].replace('-', '').replace(':', '').replace(' ', '')
        uniqueID = info[2]
        folder = os.path.join(outputPath, uniqueID)
        # An already existing folder is the normal case after the first post.
        os.makedirs(folder, exist_ok=True)
        outFile = os.path.join(folder, '{}.mp4'.format(createTime))
        # An argument list keeps paths with spaces intact and prevents the
        # page-supplied URL from being interpreted by a shell.
        command = self._BuildWgetCommand(wgetPath, outFile, httpsProxy, vidUrl)
        result = subprocess.run(command)
        if result.returncode != 0:
            print('wget exited with code {} for {}'.format(result.returncode, vidUrl))

    def Main(self, url):
        """Open the post at ``url`` and download its video; errors are printed."""
        try:
            fxDriver.get(url)
            # The original passed an undefined name `html` here, so every
            # post failed with NameError before anything was downloaded.
            info = self.GetInfo()
            self.DownloadFile(info)
        except Exception as e:
            print(e)
class PROFILE(object):
    """Crawl a TikTok profile page and collect the links of all its posts.

    Uses the module-level ``fxDriver`` browser and the ``pageDownJS`` script.
    """

    # XPath of the <a> element that wraps each post on the profile page.
    _POST_LINK_XPATH = '//a[contains(@class, "video-feed-item-wrapper")]'

    def _FindPostLinks(self):
        """Return the post link elements currently present in the page."""
        return fxDriver.find_elements_by_xpath(self._POST_LINK_XPATH)

    def _ScrollToBottom(self):
        """Scroll the page to the bottom so that more posts get loaded."""
        fxDriver.execute_script(pageDownJS)

    def GetLocY(self):
        """Return the y coordinate of the last post link on the page.

        Returns 0 when the page shows no post at all (the original code
        raised UnboundLocalError in that case).
        """
        links = self._FindPostLinks()
        if not links:
            return 0
        # Only the last element matters; no need to query every location.
        return links[-1].location['y']

    def JudgeLoading(self, locY, interval=0.5):
        """Wait briefly, then report whether new posts appeared below ``locY``.

        Args:
            locY: y coordinate of the last post seen before scrolling.
            interval: seconds to wait before measuring again.

        Returns:
            The new y coordinate of the last post if it moved further down,
            otherwise None.
        """
        time.sleep(interval)
        locYNew = self.GetLocY()
        return locYNew if locY < locYNew else None

    def GetPostUrl(self):
        """Return the ``href`` of every post link currently in the page."""
        return [e.get_attribute('href') for e in self._FindPostLinks()]

    def GetWholePage(self, maxLoadFail=20, interval=0.5):
        """Keep scrolling until no new post loads, then return all post links.

        Args:
            maxLoadFail: number of consecutive unsuccessful checks tolerated
                after a scroll; the crawl stops once this is exceeded.
            interval: seconds to wait between two checks.

        Returns:
            A list with the link of every post found on the page.
        """
        locY = self.GetLocY()
        loadFailCount = 0
        while True:
            self._ScrollToBottom()
            # Poll until the last post moves down (new content arrived) or
            # the failure budget is used up (end of the page reached).
            while True:
                locYNew = self.JudgeLoading(locY, interval)
                if locYNew is not None:
                    loadFailCount = 0
                    locY = locYNew
                    break
                loadFailCount += 1
                if loadFailCount > maxLoadFail:
                    return self.GetPostUrl()

    def Main(self, profileUrl):
        """Open ``profileUrl`` and return the links of all its posts.

        Any error is printed and an empty list is returned, so callers can
        always iterate over the result.
        """
        try:
            fxDriver.get(profileUrl)
            return self.GetWholePage()
        except Exception as e:
            print(e)
            return []
def Main():
    """Repeatedly prompt for a TikTok profile link and download its videos.

    Each round crawls the profile page with ``PROFILE`` to collect the post
    links and hands every link to ``POST`` for download, then prompts again.
    Stop the script with Ctrl+C.
    """
    # NOTE(review): the listing's trailing `Main()` is read here as a
    # self-call at the end of the function (prompt again after finishing a
    # profile). A loop gives the same behaviour without growing the call
    # stack on every round -- confirm this matches the intended flow.
    while True:
        profileUrl = input('Please input the tiktok profile link: ')
        # PROFILE.Main can come back without a list when the crawl fails;
        # treat that as "no posts" instead of crashing on iteration.
        urlList = PROFILE().Main(profileUrl) or []
        for url in urlList:
            POST().Main(url)


if __name__ == '__main__':
    Main()