目录 |
---|
0. 项目介绍 |
1. 构造浏览器实例 |
2. 爬取账户页中所有帖子的链接 |
3. 爬取并下载帖子页中的视频 |
4. 完整代码 |
0. 项目介绍
本项目的目的是输入指定TikTok账户页的链接,输出该账户每一个帖子的视频。本项目的基本结构参考这篇博文。
本项目需要先导入如下库:
from selenium import webdriver
import json, time, os
本项目的全局变量如下:
# Port of the local proxy on 127.0.0.1. It is written into the Firefox
# preference 'network.proxy.ssl_port' and into httpsProxy below.
# NOTE(review): the value is missing in this listing; it has to be filled in
# (presumably with an integer port) before the file can run.
sslPort =
# Passed to the Firefox WebDriver as firefox_binary.
fxBinaryPath = ''
# Passed to the Firefox WebDriver as executable_path.
geckodriverPath = ''
# JavaScript run in the page to scroll: sets scrollTop to a very large value.
pageDownJS = 'document.documentElement.scrollTop=100000000'
# Root folder; each account gets a sub-folder named after its uniqueId.
outputPath = ''
# Command used to launch wget for the actual video download.
wgetPath = ''
# Proxy URL handed to wget through its https_proxy setting.
httpsProxy = 'https://127.0.0.1:{}/'.format(str(sslPort))
本项目的基本结构如下:
def Main():
    """Ask for a TikTok profile link and download every post found on it.

    The profile page is scraped for post links by ``PROFILE().Main`` and each
    link is then handed to ``POST().Main``.
    """
    profileUrl = input('Please input the tiktok profile link: ')
    urlList = PROFILE().Main(profileUrl)
    # Guard against a missing result (None) so that a failed profile scrape
    # ends cleanly instead of raising TypeError in the loop.
    for url in urlList or []:
        POST().Main(url)
1. 构造浏览器实例
# Firefox preferences applied to the profile, in this order.
# NOTE(review): the numeric values are Firefox-defined modes (proxy type 1,
# TRR mode 2, image permission 2) -- confirm their meaning against the
# Firefox preference documentation before changing them.
_fxPreferences = (
    ('network.proxy.type', 1),
    ('network.proxy.ssl', '127.0.0.1'),
    ('network.proxy.ssl_port', sslPort),
    ('network.proxy.socks_remote_dns', True),
    ('network.trr.mode', 2),
    ('permissions.default.image', 2),
    ('intl.accept_languages', 'zh-CN, zh, zh-TW, zh-HK, en-US, en'),
)

fxProfile = webdriver.firefox.firefox_profile.FirefoxProfile()
for _prefName, _prefValue in _fxPreferences:
    fxProfile.set_preference(_prefName, _prefValue)

# Single browser instance shared by every scraping class in this file.
fxDriver = webdriver.firefox.webdriver.WebDriver(
    firefox_profile=fxProfile,
    firefox_binary=fxBinaryPath,
    executable_path=geckodriverPath,
)
2. 爬取账户页中所有帖子的链接
class PROFILE(object):
    """Collect the link of every post listed on a TikTok profile page.

    All methods operate on the module-level ``fxDriver`` browser instance.
    """

    # XPath of the anchor element that wraps each post on the profile page.
    _ITEM_XPATH = '//a[contains(@class, "video-feed-item-wrapper")]'
    # Number of consecutive 0.5 s checks without new posts after which the
    # page is considered fully loaded.
    _MAX_LOAD_CHECKS = 21

    def GetLocY(self):
        """Return the y position of the last post currently on the page.

        Returns 0 when no post anchor is present, so that callers can keep
        comparing positions instead of failing on an unassigned variable.
        """
        items = fxDriver.find_elements_by_xpath(self._ITEM_XPATH)
        if not items:
            return 0
        # Only the last item is needed; the positions of the earlier items
        # are never used, so they are not queried.
        return items[-1].location['y']

    def JudgeLoading(self, locY):
        """Wait 0.5 s, then report whether more posts have been appended.

        Args:
            locY: y position of the last post seen so far.

        Returns:
            The new y position of the last post when it lies below ``locY``,
            otherwise None.
        """
        time.sleep(0.5)
        locYNew = self.GetLocY()
        return locYNew if locY < locYNew else None

    def GetPostUrl(self):
        """Return the ``href`` of every post anchor currently on the page."""
        return [
            e.get_attribute('href')
            for e in fxDriver.find_elements_by_xpath(self._ITEM_XPATH)
        ]

    def GetWholePage(self):
        """Scroll until no more posts load, then return all post links.

        After every scroll the page is polled; as soon as the last post moves
        further down, the page is scrolled again. When ``_MAX_LOAD_CHECKS``
        polls in a row show no change, the collected links are returned.
        """
        locY = self.GetLocY()
        while True:
            fxDriver.execute_script(pageDownJS)
            for _ in range(self._MAX_LOAD_CHECKS):
                locYNew = self.JudgeLoading(locY)
                if locYNew is not None:
                    locY = locYNew
                    break
            else:
                # The loop ended without a break: nothing new was loaded.
                return self.GetPostUrl()

    def Main(self, profileUrl):
        """Open ``profileUrl`` and return the links of all its posts.

        Any exception is printed and an empty list is returned, so the result
        can always be iterated by the caller.
        """
        try:
            fxDriver.get(profileUrl)
            return self.GetWholePage()
        except Exception as e:
            print(e)
            return []
3. 爬取并下载帖子页中的视频
class POST(object):
    """Extract the video of a single post page and download it with wget.

    All methods operate on the module-level ``fxDriver`` browser instance.
    """

    def GetInfo(self, html=None):
        """Read the video metadata from the page currently open in fxDriver.

        Args:
            html: Unused. Kept (now optional) only so that existing callers
                that pass a value keep working; the data is read from the
                live page, not from this argument.

        Returns:
            A tuple ``(vidUrl, createTime, uniqueID)``: the first video URL,
            the creation time formatted as ``YYYY-MM-DD HH:MM:SS`` in local
            time, and the author's unique id.
        """
        jsonText = fxDriver.find_element_by_xpath(
            '//script[@type="application/json"]'
        ).get_attribute('textContent')
        videoData = json.loads(jsonText)['props']['pageProps']['videoData']
        itemInfos = videoData['itemInfos']
        vidUrl = itemInfos['video']['urls'][0]
        createTime = time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(int(itemInfos['createTime']))
        )
        uniqueID = videoData['authorInfos']['uniqueId']
        return vidUrl, createTime, uniqueID

    def DownloadFile(self, info):
        """Download the video described by ``info`` into the author's folder.

        Args:
            info: The ``(vidUrl, createTime, uniqueID)`` tuple from GetInfo.

        The file is stored as ``<outputPath>/<uniqueID>/<timestamp>.mp4``,
        where the timestamp is ``createTime`` with its separators removed.
        """
        import subprocess

        vidUrl, createTime, uniqueID = info[:3]
        fileName = createTime.replace('-', '').replace(':', '').replace(' ', '')
        folder = os.path.join(outputPath, uniqueID)
        try:
            # exist_ok avoids an error message for every video after the
            # first one of the same account.
            os.makedirs(folder, exist_ok=True)
        except OSError as e:
            # Best effort, as before: report the problem and still try wget.
            print(e)
        # The arguments are passed as a list, without a shell, so characters
        # in the scraped URL or in the paths cannot be interpreted as shell
        # syntax.
        # NOTE(review): --no-check-certificate disables TLS verification;
        # kept from the original command.
        subprocess.run([
            wgetPath,
            '--output-document={}'.format(
                os.path.join(folder, fileName + '.mp4')),
            '--no-check-certificate',
            '--execute', 'https_proxy={}'.format(httpsProxy),
            '--execute', 'robots=off',
            '--continue',
            vidUrl,
        ])

    def Main(self, url):
        """Open the post at ``url`` and download its video.

        Any exception is printed and the post is skipped.
        """
        try:
            fxDriver.get(url)
            # The previous call passed an undefined name here, which raised
            # NameError for every post and prevented any download.
            info = self.GetInfo()
            self.DownloadFile(info)
        except Exception as e:
            print(e)
4. 完整代码
from selenium import webdriver
import json, time, os
# Port of the local proxy on 127.0.0.1. It is written into the Firefox
# preference 'network.proxy.ssl_port' and into httpsProxy below.
# NOTE(review): the value is missing in this listing; it has to be filled in
# (presumably with an integer port) before the file can run.
sslPort =
# Passed to the Firefox WebDriver as firefox_binary.
fxBinaryPath = ''
# Passed to the Firefox WebDriver as executable_path.
geckodriverPath = ''
# JavaScript run in the page to scroll: sets scrollTop to a very large value.
pageDownJS = 'document.documentElement.scrollTop=100000000'
# Root folder; each account gets a sub-folder named after its uniqueId.
outputPath = ''
# Command used to launch wget for the actual video download.
wgetPath = ''
# Proxy URL handed to wget through its https_proxy setting.
httpsProxy = 'https://127.0.0.1:{}/'.format(str(sslPort))
# Firefox preferences applied to the profile, in this order.
# NOTE(review): the numeric values are Firefox-defined modes (proxy type 1,
# TRR mode 2, image permission 2) -- confirm their meaning against the
# Firefox preference documentation before changing them.
_fxPreferences = (
    ('network.proxy.type', 1),
    ('network.proxy.ssl', '127.0.0.1'),
    ('network.proxy.ssl_port', sslPort),
    ('network.proxy.socks_remote_dns', True),
    ('network.trr.mode', 2),
    ('permissions.default.image', 2),
    ('intl.accept_languages', 'zh-CN, zh, zh-TW, zh-HK, en-US, en'),
)

fxProfile = webdriver.firefox.firefox_profile.FirefoxProfile()
for _prefName, _prefValue in _fxPreferences:
    fxProfile.set_preference(_prefName, _prefValue)

# Single browser instance shared by every scraping class in this file.
fxDriver = webdriver.firefox.webdriver.WebDriver(
    firefox_profile=fxProfile,
    firefox_binary=fxBinaryPath,
    executable_path=geckodriverPath,
)
class POST(object):
    """Extract the video of a single post page and download it with wget.

    All methods operate on the module-level ``fxDriver`` browser instance.
    """

    def GetInfo(self, html=None):
        """Read the video metadata from the page currently open in fxDriver.

        Args:
            html: Unused. Kept (now optional) only so that existing callers
                that pass a value keep working; the data is read from the
                live page, not from this argument.

        Returns:
            A tuple ``(vidUrl, createTime, uniqueID)``: the first video URL,
            the creation time formatted as ``YYYY-MM-DD HH:MM:SS`` in local
            time, and the author's unique id.
        """
        jsonText = fxDriver.find_element_by_xpath(
            '//script[@type="application/json"]'
        ).get_attribute('textContent')
        videoData = json.loads(jsonText)['props']['pageProps']['videoData']
        itemInfos = videoData['itemInfos']
        vidUrl = itemInfos['video']['urls'][0]
        createTime = time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(int(itemInfos['createTime']))
        )
        uniqueID = videoData['authorInfos']['uniqueId']
        return vidUrl, createTime, uniqueID

    def DownloadFile(self, info):
        """Download the video described by ``info`` into the author's folder.

        Args:
            info: The ``(vidUrl, createTime, uniqueID)`` tuple from GetInfo.

        The file is stored as ``<outputPath>/<uniqueID>/<timestamp>.mp4``,
        where the timestamp is ``createTime`` with its separators removed.
        """
        import subprocess

        vidUrl, createTime, uniqueID = info[:3]
        fileName = createTime.replace('-', '').replace(':', '').replace(' ', '')
        folder = os.path.join(outputPath, uniqueID)
        try:
            # exist_ok avoids an error message for every video after the
            # first one of the same account.
            os.makedirs(folder, exist_ok=True)
        except OSError as e:
            # Best effort, as before: report the problem and still try wget.
            print(e)
        # The arguments are passed as a list, without a shell, so characters
        # in the scraped URL or in the paths cannot be interpreted as shell
        # syntax.
        # NOTE(review): --no-check-certificate disables TLS verification;
        # kept from the original command.
        subprocess.run([
            wgetPath,
            '--output-document={}'.format(
                os.path.join(folder, fileName + '.mp4')),
            '--no-check-certificate',
            '--execute', 'https_proxy={}'.format(httpsProxy),
            '--execute', 'robots=off',
            '--continue',
            vidUrl,
        ])

    def Main(self, url):
        """Open the post at ``url`` and download its video.

        Any exception is printed and the post is skipped.
        """
        try:
            fxDriver.get(url)
            # The previous call passed an undefined name here, which raised
            # NameError for every post and prevented any download.
            info = self.GetInfo()
            self.DownloadFile(info)
        except Exception as e:
            print(e)
class PROFILE(object):
    """Collect the link of every post listed on a TikTok profile page.

    All methods operate on the module-level ``fxDriver`` browser instance.
    """

    # XPath of the anchor element that wraps each post on the profile page.
    _ITEM_XPATH = '//a[contains(@class, "video-feed-item-wrapper")]'
    # Number of consecutive 0.5 s checks without new posts after which the
    # page is considered fully loaded.
    _MAX_LOAD_CHECKS = 21

    def GetLocY(self):
        """Return the y position of the last post currently on the page.

        Returns 0 when no post anchor is present, so that callers can keep
        comparing positions instead of failing on an unassigned variable.
        """
        items = fxDriver.find_elements_by_xpath(self._ITEM_XPATH)
        if not items:
            return 0
        # Only the last item is needed; the positions of the earlier items
        # are never used, so they are not queried.
        return items[-1].location['y']

    def JudgeLoading(self, locY):
        """Wait 0.5 s, then report whether more posts have been appended.

        Args:
            locY: y position of the last post seen so far.

        Returns:
            The new y position of the last post when it lies below ``locY``,
            otherwise None.
        """
        time.sleep(0.5)
        locYNew = self.GetLocY()
        return locYNew if locY < locYNew else None

    def GetPostUrl(self):
        """Return the ``href`` of every post anchor currently on the page."""
        return [
            e.get_attribute('href')
            for e in fxDriver.find_elements_by_xpath(self._ITEM_XPATH)
        ]

    def GetWholePage(self):
        """Scroll until no more posts load, then return all post links.

        After every scroll the page is polled; as soon as the last post moves
        further down, the page is scrolled again. When ``_MAX_LOAD_CHECKS``
        polls in a row show no change, the collected links are returned.
        """
        locY = self.GetLocY()
        while True:
            fxDriver.execute_script(pageDownJS)
            for _ in range(self._MAX_LOAD_CHECKS):
                locYNew = self.JudgeLoading(locY)
                if locYNew is not None:
                    locY = locYNew
                    break
            else:
                # The loop ended without a break: nothing new was loaded.
                return self.GetPostUrl()

    def Main(self, profileUrl):
        """Open ``profileUrl`` and return the links of all its posts.

        Any exception is printed and an empty list is returned, so the result
        can always be iterated by the caller.
        """
        try:
            fxDriver.get(profileUrl)
            return self.GetWholePage()
        except Exception as e:
            print(e)
            return []
def Main():
    """Ask for a TikTok profile link and download every post found on it.

    The profile page is scraped for post links by ``PROFILE().Main`` and each
    link is then handed to ``POST().Main``.
    """
    profileUrl = input('Please input the tiktok profile link: ')
    urlList = PROFILE().Main(profileUrl)
    # Guard against a missing result (None) so that a failed profile scrape
    # ends cleanly instead of raising TypeError in the loop.
    for url in urlList or []:
        POST().Main(url)
# Run the downloader once, and only when the file is executed as a script.
# (An additional unconditional call made the whole program run twice.)
if __name__ == '__main__':
    Main()