【Python】爬取並下載TikTok賬戶中所有帖子的視頻

目錄
0. 項目介紹
1. 構造瀏覽器實例
2. 爬取賬戶頁中所有帖子的鏈接
3. 爬取並下載帖子頁中的視頻
4. 完整代碼

0. 項目介紹

本項目的目的是:輸入指定TikTok賬戶頁的鏈接,下載該賬戶每一個帖子的視頻。本項目的基本結構參考這篇博文。


本項目需要先導入如下庫:

from selenium import webdriver
import json, time, os

本項目的全局變量如下(sslPort及各路徑變量留空,需按本機環境自行填寫後才能運行):

# Local port of the HTTPS proxy. Intentionally left blank in the article:
# fill in an integer before running, otherwise this line is a syntax error.
sslPort = 
# Path of the Firefox binary, passed to the WebDriver as firefox_binary.
fxBinaryPath = ''
# Path of the geckodriver executable, passed to the WebDriver as executable_path.
geckodriverPath = ''
# JavaScript executed on the profile page: sets the scroll position to a very
# large value, i.e. jumps to the bottom of the page.
pageDownJS = 'document.documentElement.scrollTop=100000000'
# Root download folder; a sub-folder named after the account's uniqueId is created inside it.
outputPath = ''
# Path of the wget executable used to download the videos.
wgetPath = ''
# Proxy URL handed to wget through its https_proxy setting.
httpsProxy = 'https://127.0.0.1:{}/'.format(str(sslPort))

本項目的基本結構如下:

def Main():
	"""Ask for a TikTok profile link and download the video of every post on it.

	The profile page is crawled with PROFILE().Main to collect the post links,
	then every link is handed to POST().Main, which downloads its video.
	"""
	profileUrl = input('Please input the tiktok profile link: ')
	
	urlList = PROFILE().Main(profileUrl)
	
	# PROFILE().Main prints the error and returns None when crawling fails;
	# treat that as "no posts" instead of crashing on iteration.
	for url in urlList or []:
		POST().Main(url)

1. 構造瀏覽器實例

# Build the Firefox profile, apply every preference in order, then start the browser.
fxProfile = webdriver.firefox.firefox_profile.FirefoxProfile()
for _prefName, _prefValue in (
	# Proxy settings: HTTPS traffic goes through 127.0.0.1:sslPort.
	('network.proxy.type', 1),
	('network.proxy.ssl', '127.0.0.1'),
	('network.proxy.ssl_port', sslPort),
	('network.proxy.socks_remote_dns', True),
	# NOTE(review): presumably DNS-over-HTTPS mode and "block images" — confirm against Firefox preference docs.
	('network.trr.mode', 2),
	('permissions.default.image', 2),
	# Accept-Language list sent by the browser.
	('intl.accept_languages', 'zh-CN, zh, zh-TW, zh-HK, en-US, en'),
):
	fxProfile.set_preference(_prefName, _prefValue)

# Single browser instance shared by the PROFILE and POST classes.
fxDriver = webdriver.firefox.webdriver.WebDriver(
	firefox_profile=fxProfile,
	firefox_binary=fxBinaryPath,
	executable_path=geckodriverPath,
)

2. 爬取賬戶頁中所有帖子的鏈接

class PROFILE(object):
	"""Collect the links of all posts shown on a TikTok profile page.

	Every method drives the module-level ``fxDriver`` browser instance.
	"""
	
	# XPath of the anchor element that wraps each post on the profile page.
	postXPath = '//a[contains(@class, "video-feed-item-wrapper")]'
	
	# Consecutive unsuccessful polls tolerated before the page counts as fully loaded.
	maxLoadFail = 20
	
	def GetLocY(self):
		"""Return the vertical position of the last post link, or 0 when there is none.

		Only the last element is queried, so a single location lookup is
		issued regardless of how many posts are already loaded.
		"""
		elements = fxDriver.find_elements_by_xpath(self.postXPath)
		
		if not elements:
			return 0
		
		return elements[-1].location['y']
	
	def JudgeLoading(self, locY):
		"""Wait 0.5 s, then report whether new posts appeared below ``locY``.

		Returns the new position of the last post link when it moved further
		down the page, otherwise None.
		"""
		time.sleep(0.5)
		
		locYNew = self.GetLocY()
		
		return locYNew if locY < locYNew else None
	
	def GetPostUrl(self):
		"""Return the href of every post link currently present in the page."""
		return [e.get_attribute('href') for e in fxDriver.find_elements_by_xpath(self.postXPath)]
	
	def GetWholePage(self):
		"""Scroll to the bottom repeatedly until no more posts load, then return all post links.

		After each scroll the page is polled every 0.5 s. The failure counter
		is reset whenever new posts appear; once it exceeds ``maxLoadFail``
		the page is considered complete.
		"""
		locY = self.GetLocY()
		loadFailCount = 0
		
		while True:
			fxDriver.execute_script(pageDownJS)
			
			while True:
				locYNew = self.JudgeLoading(locY)
				
				if locYNew is None:
					loadFailCount += 1
					if loadFailCount > self.maxLoadFail:
						return self.GetPostUrl()
				else:
					# New posts appeared: remember the new bottom and scroll again.
					loadFailCount = 0
					locY = locYNew
					break
	
	def Main(self, profileUrl):
		"""Open ``profileUrl`` and return the list of its post links.

		Any error is printed and an empty list is returned, so the caller can
		always iterate over the result.
		"""
		try:
			fxDriver.get(profileUrl)
			return self.GetWholePage()
		except Exception as e:
			print(e)
			return []

3. 爬取並下載帖子頁中的視頻

class POST(object):
	"""Extract the video of a single TikTok post page and download it with wget.

	Every method drives the module-level ``fxDriver`` browser instance.
	"""
	
	def GetInfo(self, html=None):
		"""Read the video details of the post currently loaded in ``fxDriver``.

		``html`` is not used; it is kept (now optional) only so existing
		callers that pass it keep working.

		Returns a tuple ``(vidUrl, createTime, uniqueID)`` where ``createTime``
		is the post's creation time in local time, formatted as
		``%Y-%m-%d %H:%M:%S``.
		"""
		jsonText = fxDriver.find_element_by_xpath('//script[@type="application/json"]').get_attribute('textContent')
		jsonData = json.loads(jsonText)['props']
		
		videoData = jsonData['pageProps']['videoData']
		vidUrl = videoData['itemInfos']['video']['urls'][0]
		createTimeStamp = int(videoData['itemInfos']['createTime'])
		createTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(createTimeStamp))
		uniqueID = videoData['authorInfos']['uniqueId']
		
		return vidUrl, createTime, uniqueID
	
	def DownloadFile(self, info):
		"""Download the video described by ``info`` into ``outputPath/<uniqueID>/``.

		``info`` is the tuple returned by GetInfo. The file is named after the
		creation time with separators removed (e.g. ``20200101123000.mp4``).

		wget is started with an argument list instead of a shell command line,
		so characters in the scraped URL or account name cannot be interpreted
		by a shell. Raises OSError when the wget executable cannot be started.
		"""
		# Local import keeps this class self-contained; the file header only imports json, time and os.
		import subprocess
		
		vidUrl, createTime, uniqueID = info[:3]
		fileName = createTime.replace('-', '').replace(':', '').replace(' ', '') + '.mp4'
		
		folder = os.path.join(outputPath, uniqueID)
		
		# The folder already exists for every video after the first one of an account.
		os.makedirs(folder, exist_ok=True)
		
		subprocess.call([
			wgetPath,
			'--output-document={}'.format(os.path.join(folder, fileName)),
			'--no-check-certificate',
			'--execute', 'https_proxy={}'.format(httpsProxy),
			'--execute', 'robots=off',
			'--continue',
			vidUrl,
		])
	
	def Main(self, url):
		"""Open the post page at ``url`` and download its video.

		Errors are printed and swallowed so that one failing post does not
		stop the remaining downloads.
		"""
		try:
			fxDriver.get(url)
			info = self.GetInfo()
			self.DownloadFile(info)
		except Exception as e:
			print(e)

4. 完整代碼

from selenium import webdriver
import json, time, os

# Local port of the HTTPS proxy. Intentionally left blank in the article:
# fill in an integer before running, otherwise this line is a syntax error.
sslPort = 
# Path of the Firefox binary, passed to the WebDriver as firefox_binary.
fxBinaryPath = ''
# Path of the geckodriver executable, passed to the WebDriver as executable_path.
geckodriverPath = ''
# JavaScript executed on the profile page: sets the scroll position to a very
# large value, i.e. jumps to the bottom of the page.
pageDownJS = 'document.documentElement.scrollTop=100000000'
# Root download folder; a sub-folder named after the account's uniqueId is created inside it.
outputPath = ''
# Path of the wget executable used to download the videos.
wgetPath = ''
# Proxy URL handed to wget through its https_proxy setting.
httpsProxy = 'https://127.0.0.1:{}/'.format(str(sslPort))

# Build the Firefox profile, apply every preference in order, then start the browser.
fxProfile = webdriver.firefox.firefox_profile.FirefoxProfile()
for _prefName, _prefValue in (
	# Proxy settings: HTTPS traffic goes through 127.0.0.1:sslPort.
	('network.proxy.type', 1),
	('network.proxy.ssl', '127.0.0.1'),
	('network.proxy.ssl_port', sslPort),
	('network.proxy.socks_remote_dns', True),
	# NOTE(review): presumably DNS-over-HTTPS mode and "block images" — confirm against Firefox preference docs.
	('network.trr.mode', 2),
	('permissions.default.image', 2),
	# Accept-Language list sent by the browser.
	('intl.accept_languages', 'zh-CN, zh, zh-TW, zh-HK, en-US, en'),
):
	fxProfile.set_preference(_prefName, _prefValue)

# Single browser instance shared by the PROFILE and POST classes.
fxDriver = webdriver.firefox.webdriver.WebDriver(
	firefox_profile=fxProfile,
	firefox_binary=fxBinaryPath,
	executable_path=geckodriverPath,
)

class POST(object):
	"""Extract the video of a single TikTok post page and download it with wget.

	Every method drives the module-level ``fxDriver`` browser instance.
	"""
	
	def GetInfo(self, html=None):
		"""Read the video details of the post currently loaded in ``fxDriver``.

		``html`` is not used; it is kept (now optional) only so existing
		callers that pass it keep working.

		Returns a tuple ``(vidUrl, createTime, uniqueID)`` where ``createTime``
		is the post's creation time in local time, formatted as
		``%Y-%m-%d %H:%M:%S``.
		"""
		jsonText = fxDriver.find_element_by_xpath('//script[@type="application/json"]').get_attribute('textContent')
		jsonData = json.loads(jsonText)['props']
		
		videoData = jsonData['pageProps']['videoData']
		vidUrl = videoData['itemInfos']['video']['urls'][0]
		createTimeStamp = int(videoData['itemInfos']['createTime'])
		createTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(createTimeStamp))
		uniqueID = videoData['authorInfos']['uniqueId']
		
		return vidUrl, createTime, uniqueID
	
	def DownloadFile(self, info):
		"""Download the video described by ``info`` into ``outputPath/<uniqueID>/``.

		``info`` is the tuple returned by GetInfo. The file is named after the
		creation time with separators removed (e.g. ``20200101123000.mp4``).

		wget is started with an argument list instead of a shell command line,
		so characters in the scraped URL or account name cannot be interpreted
		by a shell. Raises OSError when the wget executable cannot be started.
		"""
		# Local import keeps this class self-contained; the file header only imports json, time and os.
		import subprocess
		
		vidUrl, createTime, uniqueID = info[:3]
		fileName = createTime.replace('-', '').replace(':', '').replace(' ', '') + '.mp4'
		
		folder = os.path.join(outputPath, uniqueID)
		
		# The folder already exists for every video after the first one of an account.
		os.makedirs(folder, exist_ok=True)
		
		subprocess.call([
			wgetPath,
			'--output-document={}'.format(os.path.join(folder, fileName)),
			'--no-check-certificate',
			'--execute', 'https_proxy={}'.format(httpsProxy),
			'--execute', 'robots=off',
			'--continue',
			vidUrl,
		])
	
	def Main(self, url):
		"""Open the post page at ``url`` and download its video.

		Errors are printed and swallowed so that one failing post does not
		stop the remaining downloads.
		"""
		try:
			fxDriver.get(url)
			info = self.GetInfo()
			self.DownloadFile(info)
		except Exception as e:
			print(e)

class PROFILE(object):
	"""Collect the links of all posts shown on a TikTok profile page.

	Every method drives the module-level ``fxDriver`` browser instance.
	"""
	
	# XPath of the anchor element that wraps each post on the profile page.
	postXPath = '//a[contains(@class, "video-feed-item-wrapper")]'
	
	# Consecutive unsuccessful polls tolerated before the page counts as fully loaded.
	maxLoadFail = 20
	
	def GetLocY(self):
		"""Return the vertical position of the last post link, or 0 when there is none.

		Only the last element is queried, so a single location lookup is
		issued regardless of how many posts are already loaded.
		"""
		elements = fxDriver.find_elements_by_xpath(self.postXPath)
		
		if not elements:
			return 0
		
		return elements[-1].location['y']
	
	def JudgeLoading(self, locY):
		"""Wait 0.5 s, then report whether new posts appeared below ``locY``.

		Returns the new position of the last post link when it moved further
		down the page, otherwise None.
		"""
		time.sleep(0.5)
		
		locYNew = self.GetLocY()
		
		return locYNew if locY < locYNew else None
	
	def GetPostUrl(self):
		"""Return the href of every post link currently present in the page."""
		return [e.get_attribute('href') for e in fxDriver.find_elements_by_xpath(self.postXPath)]
	
	def GetWholePage(self):
		"""Scroll to the bottom repeatedly until no more posts load, then return all post links.

		After each scroll the page is polled every 0.5 s. The failure counter
		is reset whenever new posts appear; once it exceeds ``maxLoadFail``
		the page is considered complete.
		"""
		locY = self.GetLocY()
		loadFailCount = 0
		
		while True:
			fxDriver.execute_script(pageDownJS)
			
			while True:
				locYNew = self.JudgeLoading(locY)
				
				if locYNew is None:
					loadFailCount += 1
					if loadFailCount > self.maxLoadFail:
						return self.GetPostUrl()
				else:
					# New posts appeared: remember the new bottom and scroll again.
					loadFailCount = 0
					locY = locYNew
					break
	
	def Main(self, profileUrl):
		"""Open ``profileUrl`` and return the list of its post links.

		Any error is printed and an empty list is returned, so the caller can
		always iterate over the result.
		"""
		try:
			fxDriver.get(profileUrl)
			return self.GetWholePage()
		except Exception as e:
			print(e)
			return []

def Main():
	"""Repeatedly ask for a TikTok profile link and download the video of every post on it.

	Each profile page is crawled with PROFILE().Main to collect the post
	links, then every link is handed to POST().Main. The prompt is shown
	again after a profile is finished; entering an empty line ends the loop.
	"""
	# A loop replaces the former self-recursion, which added one stack frame
	# per processed profile and would eventually raise RecursionError.
	while True:
		profileUrl = input('Please input the tiktok profile link: ')
		
		if not profileUrl.strip():
			return
		
		urlList = PROFILE().Main(profileUrl)
		
		# PROFILE().Main prints the error and returns None when crawling fails;
		# treat that as "no posts" instead of crashing on iteration.
		for url in urlList or []:
			POST().Main(url)

# Run the interactive downloader only when this file is executed as a script.
if __name__ == '__main__':
	Main()
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章