【Python】爬取並下載Instagram帖子的信息、圖片和視頻

目錄
0. 項目介紹
1. 爬取賬戶頁中所有帖子的鏈接
2. 爬取並下載帖子頁中的信息、圖片和視頻
3. 完整代碼

0. 項目介紹

本項目的目的是輸入Instagram賬號或賬戶頁鏈接或帖子頁鏈接,輸出該賬戶帖子的:① 包含當前時間、用戶上傳帖子的時間(當前時區)、用戶名稱(Username)、用戶全稱(Full name)、帖子文字、點贊數、評論數、圖片描述(當帖子中有圖片時)、圖片鏈接(當帖子中有圖片時)、視頻觀看數(當帖子中有視頻時)、視頻鏈接(當帖子中有視頻時)的文本文檔;② 圖片(當帖子中有圖片時)、視頻(當帖子中有視頻時)。


本項目需要先導入如下庫:

from selenium import webdriver
from multiprocessing import Pool
import json, time, os

本項目的全局變量如下:

# --- Global configuration (fill in before running) -------------------------
sslPort = 0  # bug fix: was an empty assignment (SyntaxError); set the local port of an HTTPS proxy that can reach Instagram
fxBinaryPath = ''  # absolute path to Firefox's firefox.exe
geckodriverPath = ''  # absolute path to geckodriver.exe
pageDownJS = 'document.documentElement.scrollTop = 100000000'  # JS that jumps to the page bottom
outputPath = ''  # root directory for downloaded posts
ariaPath = ''  # absolute path to aria2c.exe (NOTE(review): the download code later uses wgetPath — confirm which tool is intended)
httpsProxy = 'https://127.0.0.1:{}/'.format(str(sslPort))  # proxy URL handed to the downloader

解釋:

  • sslPort:可用於訪問Instagram的HTTPS代理的本地端口。
  • fxBinaryPath:Firefox瀏覽器的firefox.exe的絕對路徑。
  • geckodriverPath:geckodriver.exe的絕對路徑。
  • pageDownJS:用於下拉頁面的JavaScript代碼。
  • outputPath:輸出路徑。
  • ariaPath:aria2c.exe的絕對路徑。
  • httpsProxy:用於GNU Wget for Windows的HTTPS代理。

本項目的基本結構如下:

def Driver():
	# Driver builds and returns the Firefox browser (WebDriver) instance.

class DOWNLOAD:
	# DOWNLOAD is a multi-process download utility.

class POST:
	# POST scrapes and downloads the info, pictures and videos of a post page.

class PROFILE:
	# PROFILE scrapes the links of all posts on a profile page.

def Main():
	# Main is the entry point: it takes an Instagram username, profile-page
	# link or post-page link and drives the classes/functions above.

if __name__ == '__main__':
	Main()

本項目的運行流程可見Main函數:

def Main():
	"""Entry point: prompt for an Instagram username / profile link / post
	link and dispatch to POST (single post) or PROFILE (whole account).

	Bug fix: the original restarted itself via tail recursion (`Main()` at
	the end), which eventually hits Python's recursion limit on long
	sessions; a `while True` loop has identical behavior without that risk.
	"""
	while True:
		fxDriver = Driver()
		inputUrl = input('Please input instagram link or username: ')

		if '/p/' in inputUrl:
			# A single post page.
			POST(fxDriver, inputUrl).Main()
		else:
			# Treat a bare username as a profile link.
			if not 'www.instagram.com' in inputUrl:
				inputUrl = 'https://www.instagram.com/{}/'.format(inputUrl)
			urlList = PROFILE(fxDriver, inputUrl).Main()
			if urlList:
				total = len(urlList)
				for done, url in enumerate(urlList, start=1):
					POST(fxDriver, url).Main()
					print('\n\n{:.2f} % completed.\n\n'.format(done / total * 100))

		fxDriver.quit()

1. 爬取賬戶頁中所有帖子的鏈接

這一步的基本結構如下:

def Main(self):
	"""Open the profile page and return the list of post links (None on error)."""
	try:
		# Navigate the shared Firefox driver to the profile page.
		fxDriver.get(self.profileUrl)
		# Scroll through the whole page, collecting every post link.
		urlList = self.GetWholePage()
		return urlList
	except Exception as e:
		# Best-effort: log and fall through (implicitly returns None).
		print(e)
解釋:① 瀏覽器訪問賬戶頁。② self.GetWholePage()負責爬取賬戶頁中所有帖子的鏈接,生成鏈接列表urlList


self.GetWholePage()如下:

def GetWholePage(self):
	"""Scroll the profile page until `updateCount` new post links are collected.

	Returns the newest `updateCount` post URLs ordered by on-page position.
	"""
	# Number of posts not yet downloaded locally.
	updateCount = self.Update()
	fxDriver.execute_script(pageDownJS)
	
	try:
		# Click the "more posts" button if present (button text is localized Chinese).
		fxDriver.find_element_by_xpath('//div[contains(text(), "更多帖子")]').click()
	except Exception as e:
		print(e)
	
	locY, urlDict = self.GetLocY()
	
	while 1:
		# Jump to the bottom to trigger lazy loading of more posts.
		fxDriver.execute_script(pageDownJS)
		while 1:
			# JudgeLoading waits 0.5 s and reports whether the page grew.
			locYNew, urlDictNew = self.JudgeLoading(locY, urlDict)
			# Sort collected links by their position key (x/1000 + y).
			urlList = [t[0] for t in sorted(urlDictNew.items(), key = lambda x:x[1])]
			
			if len(urlList) >= updateCount:
				return urlList[: updateCount]
			
			if locYNew == None:
				# Still loading: poll again without scrolling further.
				continue
			else:
				locY = locYNew
				urlDict = urlDictNew
				break

解釋:

  • self.Update()用於計算需要更新的貼子數。
  • fxDriver.execute_script(pageDownJS)可以通過執行JS代碼pageDownJS把頁面拉到最下面。
  • self.GetLocY()可以獲得賬戶頁HTML中每個帖子鏈接所在tag的Y座標locY和當前加載的所有帖子的鏈接字典urlDict
  • self.JudgeLoading(locY, urlDict)可以對比輸入的Y座標和0.5秒之後的Y座標來判斷pageDownJS有沒有執行完畢。

self.Update()如下:

def Update(self):
	"""Return how many posts still need downloading (total posts minus local folders)."""
	# Locate the inline <script> that carries the profile's JSON payload.
	for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
		try:
			jsonText = e.get_attribute('textContent')
			if 'viewerId' in jsonText:
				# Slice out the outermost {...} and drill down to the user object.
				jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['ProfilePage'][0]['graphql']['user']
				break
		except:
			continue
	
	# NOTE(review): if no matching <script> is found, jsonData is unbound and
	# the next line raises NameError — the caller's try/except absorbs it.
	postCount = jsonData['edge_owner_to_timeline_media']['count']
	username = jsonData['username']
	# One sub-folder per already-downloaded post, under <outputPath>\<username>.
	folder = '{}\\{}'.format(outputPath, username)
	
	if os.path.exists(folder):
		downloadCount = len([x for x in os.listdir(folder) if os.path.isdir('{}\\{}'.format(folder, x))])
	else:
		downloadCount = 0
	
	updateCount = postCount - downloadCount
	
	return updateCount

解釋:① 解析網頁中的貼子數。② 統計已經下載了多少帖子。③ 計算需要更新的貼子數。


self.GetLocY()如下:

def GetLocY(self):
	"""Collect every post link currently in the DOM.

	Returns:
		locY: y-coordinate of the last post link seen (0 when none are
		      loaded yet — previously this raised NameError on an empty page).
		urlDict: maps post URL -> locX/1000 + locY, a sort key that orders
		         links top-to-bottom, then left-to-right within a row.
	"""
	urlDict = {}
	locY = 0  # bug fix: defined even when no '/p/' links are present yet
	
	for e in fxDriver.find_elements_by_xpath('//a[contains(@href, "/p/")]'):
		locY = e.location['y']
		locX = e.location['x']
		url = e.get_attribute('href')
		# x/1000 breaks ties between links that share the same row (same y).
		urlDict[url] = locX/1000 + locY
	
	return locY, urlDict

解釋:通過循環判斷'/p/'有沒有在a標籤的'href'屬性中來獲得帖子鏈接及其所在tag的Y座標。


self.JudgeLoading(locY, urlDict)如下:

def JudgeLoading(self, locY, urlDict):
	"""Wait 0.5 s, re-scan the page, and report whether new content loaded.

	Returns (newY, mergedDict) when the page grew past `locY`, or
	(None, freshDict) when the last link's y-coordinate has not advanced.
	"""
	time.sleep(0.5)
	newY, freshDict = self.GetLocY()

	if newY <= locY:
		# Page has not grown yet: signal "still loading" with None.
		return None, freshDict

	# Page grew: fold the previously collected links into the fresh scan.
	freshDict.update(urlDict)
	return newY, freshDict

把上述模塊如下整合到類中:

class PROFILE:
	"""Scrapes the links of all not-yet-downloaded posts on a profile page.

	Relies on module-level globals: fxDriver (shared Firefox driver),
	pageDownJS (scroll-to-bottom JS) and outputPath (download root).
	"""
	
	def __init__(self, profileUrl):
		# profileUrl: full URL of the account page to scrape.
		self.profileUrl = profileUrl
	
	def Update(self):
		"""Return how many posts still need downloading (total minus local folders)."""
		# Locate the inline <script> that carries the profile's JSON payload.
		for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
			try:
				jsonText = e.get_attribute('textContent')
				if 'viewerId' in jsonText:
					# Slice out the outermost {...} and drill down to the user object.
					jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['ProfilePage'][0]['graphql']['user']
					break
			except:
				continue
		
		# NOTE(review): jsonData is unbound (NameError) if no script matched;
		# Main() below absorbs the exception, but an explicit check would be clearer.
		postCount = jsonData['edge_owner_to_timeline_media']['count']
		username = jsonData['username']
		folder = '{}\\{}'.format(outputPath, username)
		
		# Each already-downloaded post occupies one sub-folder.
		if os.path.exists(folder):
			downloadCount = len([x for x in os.listdir(folder) if os.path.isdir('{}\\{}'.format(folder, x))])
		else:
			downloadCount = 0
		
		updateCount = postCount - downloadCount
		
		return updateCount
	
	def GetLocY(self):
		"""Scan the DOM for post links; return (last link's y, {url: position key})."""
		urlDict = {}
		
		# NOTE(review): locY is unbound (NameError) if no '/p/' links exist
		# yet — confirm the page always renders at least one before this runs.
		for e in fxDriver.find_elements_by_xpath('//a[contains(@href, "/p/")]'):
			locY = e.location['y']
			locX = e.location['x']
			url = e.get_attribute('href')
			# x/1000 orders links within a row without disturbing row order.
			urlDict[url] = locX/1000 + locY
		
		return locY, urlDict
	
	def JudgeLoading(self, locY, urlDict):
		"""Wait 0.5 s and re-scan; (None, fresh) while loading, else (newY, merged)."""
		time.sleep(0.5)
		locYNew, urlDictNew = self.GetLocY()
		
		if locYNew > locY:
			# Page grew: merge previously collected links into the fresh scan.
			urlDictNew.update(urlDict)
		else:
			locYNew = None
		
		return locYNew, urlDictNew
	
	def GetWholePage(self):
		"""Keep scrolling until updateCount new post links are collected; return them in page order."""
		updateCount = self.Update()
		fxDriver.execute_script(pageDownJS)
		
		try:
			# Click the (Chinese-localized) "more posts" button if present.
			fxDriver.find_element_by_xpath('//div[contains(text(), "更多帖子")]').click()
		except Exception as e:
			print(e)
		
		locY, urlDict = self.GetLocY()
		
		while 1:
			# Jump to the bottom to trigger lazy loading of more posts.
			fxDriver.execute_script(pageDownJS)
			while 1:
				locYNew, urlDictNew = self.JudgeLoading(locY, urlDict)
				# Sort by on-page position (top-to-bottom, left-to-right).
				urlList = [t[0] for t in sorted(urlDictNew.items(), key = lambda x:x[1])]
				
				if len(urlList) >= updateCount:
					return urlList[: updateCount]
				
				if locYNew == None:
					# Still loading: poll again without scrolling.
					continue
				else:
					locY = locYNew
					urlDict = urlDictNew
					break
	
	def Main(self):
		"""Open the profile page; return the post-link list, or None on error."""
		try:
			fxDriver.get(self.profileUrl)
			urlList = self.GetWholePage()
			return urlList
		except Exception as e:
			print(e)

解釋:可以通過執行PROFILE(inputUrl).Main()獲得賬戶頁中所有帖子的鏈接。

2. 爬取並下載帖子頁中的信息、圖片和視頻

這一步的基本結構如下:

def Main(self):
	"""Open the post page, scrape its info, then save the text file and media."""
	try:
		fxDriver.get(self.url)
		# Scrape time/user/caption/counters plus media URLs from the page JSON.
		info = self.GetInfo()
		# Write info.txt into the per-post folder.
		self.DownloadInfo(info)
		# Download the pictures/videos listed in `info`.
		self.DownloadFile(info)
	except Exception as e:
		print(e)

解釋:① 瀏覽器訪問帖子頁。② self.GetInfo()可以獲得用戶上傳帖子的時間(當前時區)、用戶名稱(Username)、用戶全稱(Full name)、帖子文字、點贊數、評論數、圖片描述(當帖子中有圖片時)、圖片鏈接(當帖子中有圖片時)、視頻觀看數(當帖子中有視頻時)、視頻鏈接(當帖子中有視頻時)等信息。③ self.DownloadInfo(info)把信息寫入文本文檔。④ self.DownloadFile(info)根據獲取的信息下載帖子頁中的圖片和視頻。


self.GetInfo()如下:

def GetInfo(self):
	"""Parse the post's embedded JSON; the returned tuple ends with a tag:
	'm' = multi-media, 'v' = single video, 'p' = single picture.
	"""
	# Locate the inline <script> that carries the post's JSON payload.
	for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
		try:
			jsonText = e.get_attribute('textContent')
			if '"viewerId":null' in jsonText:
				# Slice the outermost {...} and drill down to the media object.
				jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['PostPage'][0]['graphql']['shortcode_media']
				break
		except:
			continue
	
	# NOTE(review): jsonData is unbound (NameError) if no script matched;
	# the caller's try/except absorbs it.
	uploadTimeStamp = jsonData['taken_at_timestamp']
	uploadTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(uploadTimeStamp))
	username = jsonData['owner']['username']
	fullName = jsonData['owner']['full_name']
	likes = jsonData['edge_media_preview_like']['count']
	comments = jsonData['edge_media_preview_comment']['count']
	try:
		text = jsonData['edge_media_to_caption']['edges'][0]['node']['text']
	except:
		text = ''  # caption is optional
	
	# Exception-driven dispatch: multi-media sidecar first, then single
	# video, then single picture (missing keys raise and fall through).
	try:
		mediaDict = {}
		for obj in jsonData['edge_sidecar_to_children']['edges']:
			try:
				# Video node: keyed by URL, value is the view count.
				vidUrl = obj['node']['video_url']
				vidViewCount = obj['node']['video_view_count']
				mediaDict[vidUrl] = vidViewCount
			except:
				# Picture node: keyed by URL, value is the accessibility caption.
				picUrl = obj['node']['display_url']
				picDescription = obj['node']['accessibility_caption']
				mediaDict[picUrl] = picDescription
		return uploadTime, username, fullName, likes, comments, text, mediaDict, 'm'
	except:
		try:
			vidUrl = jsonData['video_url']
			vidViewCount = jsonData['video_view_count']
			return uploadTime, username, fullName, likes, comments, text, vidUrl, vidViewCount, 'v'
		except:
			picUrl = jsonData['display_url']
			picDescription = jsonData['accessibility_caption']
			return uploadTime, username, fullName, likes, comments, text, picUrl, picDescription, 'p'

解釋:① 無需登錄即可獲取包含所有信息的JSON數據。② 獲取一些通用的信息,如發佈時間、用戶名、點贊數、評論數等。③ 獲取圖片或視頻的鏈接和相關信息。帖子可以分爲多圖(視頻)和單圖(視頻),而在多圖(視頻)帖子中需要判斷是圖片還是視頻,在單圖(視頻)帖子中也要判斷。


self.DownloadInfo(info)如下:

def DownloadInfo(self, info):
	"""Write info.txt (timestamps, user, caption, counters, media info) into the post's folder."""
	now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(time.time())))
	uploadTime = info[0]
	username = info[1]
	fullName = info[2]
	likes = info[3]
	comments = info[4]
	text = info[5]
	# Folder name is the upload time with separators stripped: yyyymmddHHMMSS.
	folder = '{}\\{}\\{}'.format(outputPath, username, uploadTime.replace('-', '').replace(':', '').replace(' ', ''))
	
	try:
		os.makedirs(folder)
	except Exception as e:
		# Best-effort: the folder may already exist.
		print(e)
	
	with open('{}\\info.txt'.format(folder), 'w', encoding='utf-8') as f:
		f.write('Now: {}'.format(now))
		f.write('\nUpload time: {}'.format(uploadTime))
		f.write('\nUsername: {}'.format(username))
		f.write('\nFull name: {}'.format(fullName))
		f.write('\nText: {}'.format(text))
		f.write('\nLikes: {}'.format(likes))
		f.write('\nComments: {}'.format(comments))
		
		if info[-1] == 'm':
			mediaDict = info[6]
			i = 1
			for mediaUrl, mediaInfo in mediaDict.items():
				# Heuristic: an all-digits value marks a video (view count);
				# anything else is a picture description.
				if str(mediaInfo).isdigit():
					f.write('\n{}. Video view count: {}'.format(str(i), str(mediaInfo)))
					f.write('\n{}. Video url: {}'.format(str(i), mediaUrl))
				else:
					f.write('\n{}. Picture description: {}'.format(str(i), mediaInfo))
					f.write('\n{}. Picture url: {}'.format(str(i), mediaUrl))
				i += 1
		elif info[-1] == 'v':
			vidUrl = info[6]
			vidViewCount = info[7]
			f.write('\nVideo view count: {}'.format(vidViewCount))
			f.write('\nVideo url: {}'.format(vidUrl))
		elif info[-1] == 'p':
			picUrl = info[6]
			picDescription = info[7]
			f.write('\nPicture description: {}'.format(picDescription))
			f.write('\nPicture url: {}'.format(picUrl))

self.DownloadFile(info)如下:

def DownloadFile(self, info):
	"""Download the post's media with GNU Wget through the HTTPS proxy.

	NOTE(review): this uses a global wgetPath, but the intro's globals
	define ariaPath (aria2c) instead — confirm which downloader is intended.
	NOTE(review): URLs are interpolated into an os.system shell command; a
	hostile URL could inject shell commands — subprocess with a list is safer.
	"""
	uploadTime = info[0]
	username = info[1]
	# Same per-post folder naming as DownloadInfo: yyyymmddHHMMSS.
	folder = '{}\\{}\\{}'.format(outputPath, username, uploadTime.replace('-', '').replace(':', '').replace(' ', ''))
	
	if info[-1] == 'm':
		mediaDict = info[6]
		i = 1
		for mediaUrl, mediaInfo in mediaDict.items():
			# All-digits value => video (view count); otherwise picture.
			if str(mediaInfo).isdigit():
				os.system('{} --output-document={}\\{}.mp4 --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, str(i), httpsProxy, mediaUrl))
			else:
				os.system('{} --output-document={}\\{}.jpg --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, str(i), httpsProxy, mediaUrl))
			i += 1
	elif info[-1] == 'v':
		vidUrl = info[6]
		os.system('{} --output-document={}\\1.mp4 --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, httpsProxy, vidUrl))
	elif info[-1] == 'p':
		picUrl = info[6]
		os.system('{} --output-document={}\\1.jpg --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, httpsProxy, picUrl))

解釋:參考這篇博文


把上述模塊如下整合到類中:

class POST:
	"""Scrapes a single post page: saves metadata to info.txt and downloads
	the media files with wget through the HTTPS proxy.

	Relies on module-level globals: fxDriver, outputPath, wgetPath, httpsProxy.
	"""
	
	def __init__(self, url):
		# url: post-page URL (contains '/p/').
		self.url = url
	
	def GetInfo(self):
		"""Parse the post's embedded JSON; the returned tuple ends with a tag:
		'm' multi-media (info[6] is a (url, count_or_description) list),
		'v' single video, 'p' single picture.
		"""
		# Locate the inline <script> that carries the post's JSON payload.
		for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
			try:
				jsonText = e.get_attribute('textContent')
				if 'viewerId' in jsonText:
					# Slice the outermost {...} and drill down to the media object.
					jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['PostPage'][0]['graphql']['shortcode_media']
					break
			except:
				continue
		
		# NOTE(review): jsonData is unbound (NameError) if no script matched;
		# Main() below absorbs the exception.
		uploadTimeStamp = jsonData['taken_at_timestamp']
		uploadTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(uploadTimeStamp))
		username = jsonData['owner']['username']
		fullName = jsonData['owner']['full_name']
		likes = jsonData['edge_media_preview_like']['count']
		comments = jsonData['edge_media_preview_comment']['count']
		try:
			text = jsonData['edge_media_to_caption']['edges'][0]['node']['text']
		except:
			text = ''  # caption is optional
		
		# Exception-driven dispatch: multi-media sidecar first, then single
		# video, then single picture (missing keys raise and fall through).
		try:
			mediaList = []
			for obj in jsonData['edge_sidecar_to_children']['edges']:
				try:
					# Video node: (url, view count).
					vidUrl = obj['node']['video_url']
					vidViewCount = obj['node']['video_view_count']
					mediaList.append((vidUrl, vidViewCount))
				except:
					# Picture node: (url, accessibility caption).
					picUrl = obj['node']['display_url']
					picDescription = obj['node']['accessibility_caption']
					mediaList.append((picUrl, picDescription))
			return uploadTime, username, fullName, likes, comments, text, mediaList, 'm'
		except:
			try:
				vidUrl = jsonData['video_url']
				vidViewCount = jsonData['video_view_count']
				return uploadTime, username, fullName, likes, comments, text, vidUrl, vidViewCount, 'v'
			except:
				picUrl = jsonData['display_url']
				picDescription = jsonData['accessibility_caption']
				return uploadTime, username, fullName, likes, comments, text, picUrl, picDescription, 'p'
	
	def DownloadInfo(self, info):
		"""Write info.txt with timestamps, user data, caption, counters and media info."""
		now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(time.time())))
		uploadTime = info[0]
		username = info[1]
		fullName = info[2]
		likes = info[3]
		comments = info[4]
		text = info[5]
		# Folder name is the upload time reduced to digits: yyyymmddHHMMSS.
		folder = '{}\\{}\\{}'.format(outputPath, username, ''.join([x for x in uploadTime if x.isdigit()]))
		
		try:
			os.makedirs(folder)
		except Exception as e:
			# Best-effort: the folder may already exist.
			print(e)
		
		with open('{}\\info.txt'.format(folder), 'w', encoding='utf-8') as f:
			f.write('Now: {}'.format(now))
			f.write('\nUpload time: {}'.format(uploadTime))
			f.write('\nUsername: {}'.format(username))
			f.write('\nFull name: {}'.format(fullName))
			f.write('\nText: {}'.format(text))
			f.write('\nLikes: {}'.format(likes))
			f.write('\nComments: {}'.format(comments))
			
			if info[-1] == 'm':
				mediaList = info[6]
				i = 1
				for mediaTuple in mediaList:
					# Heuristic: an all-digits second element marks a video
					# (view count); anything else is a picture description.
					if str(mediaTuple[1]).isdigit():
						vidUrl = mediaTuple[0]; vidViewCount = str(mediaTuple[1])
						f.write('\n{}. Video view count: {} Video url: {}'.format(str(i), vidViewCount, vidUrl))
					else:
						picUrl = mediaTuple[0]; picDescription = mediaTuple[1]
						f.write('\n{}. Picture description: {} Picture url: {}'.format(str(i), picDescription, picUrl))
					i += 1
			elif info[-1] == 'v':
				vidUrl = info[6]
				vidViewCount = info[7]
				f.write('\nVideo view count: {}'.format(vidViewCount))
				f.write('\nVideo url: {}'.format(vidUrl))
			elif info[-1] == 'p':
				picUrl = info[6]
				picDescription = info[7]
				f.write('\nPicture description: {}'.format(picDescription))
				f.write('\nPicture url: {}'.format(picUrl))
	
	def DownloadFile(self, info):
		"""Download every picture/video of the post via wget.

		NOTE(review): URLs are interpolated into an os.system shell command;
		a hostile URL could inject shell commands — subprocess is safer.
		"""
		uploadTime = info[0]
		username = info[1]
		# Same per-post folder naming as DownloadInfo.
		folder = '{}\\{}\\{}'.format(outputPath, username, ''.join([x for x in uploadTime if x.isdigit()]))
		
		try:
			os.makedirs(folder)
		except Exception as e:
			# Usually already created by DownloadInfo; the error is printed.
			print(e)
		
		if info[-1] == 'm':
			mediaList = info[6]
			i = 1
			for mediaTuple in mediaList:
				# All-digits second element => video; otherwise picture.
				if str(mediaTuple[1]).isdigit():
					vidUrl = mediaTuple[0]
					os.system('{} --output-document={}\\{}.mp4 --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, str(i), httpsProxy, vidUrl))
				else:
					picUrl = mediaTuple[0]
					os.system('{} --output-document={}\\{}.jpg --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, str(i), httpsProxy, picUrl))
				i += 1
		elif info[-1] == 'v':
			vidUrl = info[6]
			os.system('{} --output-document={}\\1.mp4 --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, httpsProxy, vidUrl))
		elif info[-1] == 'p':
			picUrl = info[6]
			os.system('{} --output-document={}\\1.jpg --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, httpsProxy, picUrl))
	
	def Main(self):
		"""Open the post page, scrape it, then save info.txt and the media files."""
		try:
			fxDriver.get(self.url)
			info = self.GetInfo()
			self.DownloadInfo(info)
			self.DownloadFile(info)
		except Exception as e:
			print(e)

解釋:可以通過執行POST(url).Main()下載每一個帖子的信息、圖片和視頻。

3. 完整代碼

from selenium import webdriver
import json, time, os

# --- Configuration (fill in before running) --------------------------------
sslPort = 0  # bug fix: was an empty assignment (SyntaxError); set the local HTTPS-proxy port
fxBinaryPath = ''  # absolute path to Firefox's firefox.exe
geckodriverPath = ''  # absolute path to geckodriver.exe
pageDownJS = 'document.documentElement.scrollTop = 100000000'  # JS: jump to page bottom
outputPath = ''  # root output directory
wgetPath = ''  # absolute path to wget.exe
httpsProxy = 'https://127.0.0.1:{}/'.format(str(sslPort))  # proxy URL handed to wget

# Build a Firefox profile that routes SSL traffic through the local proxy.
fxProfile = webdriver.firefox.firefox_profile.FirefoxProfile()
fxProfile.set_preference('network.proxy.type', 1)  # manual proxy configuration
fxProfile.set_preference('network.proxy.ssl', '127.0.0.1')  # proxy host
fxProfile.set_preference('network.proxy.ssl_port', sslPort)  # proxy port
fxProfile.set_preference('network.proxy.socks_remote_dns', True)  # resolve DNS through the proxy
fxProfile.set_preference('network.trr.mode', 2)  # DNS-over-HTTPS preference — confirm desired mode
fxProfile.set_preference('permissions.default.image', 2)  # block images to speed up page loads
fxProfile.set_preference('intl.accept_languages', 'zh-CN, zh, zh-TW, zh-HK, en-US, en')  # Chinese UI so the "更多帖子" button text matches
# NOTE(review): firefox_profile / firefox_binary / executable_path are
# Selenium 3 APIs, removed in Selenium 4 — confirm the pinned version.
fxDriver = webdriver.firefox.webdriver.WebDriver(firefox_profile=fxProfile, firefox_binary=fxBinaryPath, executable_path=geckodriverPath)

class POST:
	"""Scrapes one Instagram post page: saves its metadata to info.txt and
	downloads its pictures/videos with wget through the HTTPS proxy.

	Relies on module-level globals: fxDriver, outputPath, wgetPath, httpsProxy.
	"""
	
	def __init__(self, url):
		# url: post-page URL (contains '/p/').
		self.url = url
	
	def GetInfo(self):
		"""Parse the post's embedded JSON and return a data tuple.

		The last tuple element is a type tag:
		  'm' -> info[6] is a list of (url, view_count_or_description) tuples
		  'v' -> info[6] is the video URL, info[7] its view count
		  'p' -> info[6] is the picture URL, info[7] its accessibility caption

		Raises ValueError when no recognizable JSON payload is on the page
		(bug fix: previously this surfaced as a confusing NameError).
		"""
		jsonData = None  # bug fix: was unbound when no <script> matched
		for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
			try:
				jsonText = e.get_attribute('textContent')
				if 'viewerId' in jsonText:
					# Slice the outermost {...} and drill down to the media object.
					jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['PostPage'][0]['graphql']['shortcode_media']
					break
			except:
				continue
		
		if jsonData is None:
			raise ValueError('post JSON not found on page: {}'.format(self.url))
		
		uploadTimeStamp = jsonData['taken_at_timestamp']
		uploadTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(uploadTimeStamp))
		username = jsonData['owner']['username']
		fullName = jsonData['owner']['full_name']
		likes = jsonData['edge_media_preview_like']['count']
		comments = jsonData['edge_media_preview_comment']['count']
		try:
			text = jsonData['edge_media_to_caption']['edges'][0]['node']['text']
		except:
			text = ''  # caption is optional
		
		# Exception-driven dispatch: multi-media sidecar first, then single
		# video, then single picture (missing keys raise and fall through).
		try:
			mediaList = []
			for obj in jsonData['edge_sidecar_to_children']['edges']:
				try:
					# Video node: (url, view count).
					mediaList.append((obj['node']['video_url'], obj['node']['video_view_count']))
				except:
					# Picture node: (url, accessibility caption).
					mediaList.append((obj['node']['display_url'], obj['node']['accessibility_caption']))
			return uploadTime, username, fullName, likes, comments, text, mediaList, 'm'
		except:
			try:
				return uploadTime, username, fullName, likes, comments, text, jsonData['video_url'], jsonData['video_view_count'], 'v'
			except:
				return uploadTime, username, fullName, likes, comments, text, jsonData['display_url'], jsonData['accessibility_caption'], 'p'
	
	def _Folder(self, info):
		"""Per-post output folder: <outputPath>\\<username>\\<yyyymmddHHMMSS>."""
		uploadTime = info[0]
		username = info[1]
		return '{}\\{}\\{}'.format(outputPath, username, ''.join([x for x in uploadTime if x.isdigit()]))
	
	def DownloadInfo(self, info):
		"""Write info.txt with timestamps, user data, caption, counters and media info."""
		now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(time.time())))
		uploadTime = info[0]
		username = info[1]
		fullName = info[2]
		likes = info[3]
		comments = info[4]
		text = info[5]
		folder = self._Folder(info)
		
		# bug fix: exist_ok avoids printing a spurious FileExistsError.
		os.makedirs(folder, exist_ok=True)
		
		with open('{}\\info.txt'.format(folder), 'w', encoding='utf-8') as f:
			f.write('Now: {}'.format(now))
			f.write('\nUpload time: {}'.format(uploadTime))
			f.write('\nUsername: {}'.format(username))
			f.write('\nFull name: {}'.format(fullName))
			f.write('\nText: {}'.format(text))
			f.write('\nLikes: {}'.format(likes))
			f.write('\nComments: {}'.format(comments))
			
			if info[-1] == 'm':
				# Heuristic: an all-digits second element marks a video (view
				# count); anything else is a picture description.
				for i, (mediaUrl, mediaInfo) in enumerate(info[6], start=1):
					if str(mediaInfo).isdigit():
						f.write('\n{}. Video view count: {} Video url: {}'.format(str(i), str(mediaInfo), mediaUrl))
					else:
						f.write('\n{}. Picture description: {} Picture url: {}'.format(str(i), mediaInfo, mediaUrl))
			elif info[-1] == 'v':
				vidUrl = info[6]
				vidViewCount = info[7]
				f.write('\nVideo view count: {}'.format(vidViewCount))
				f.write('\nVideo url: {}'.format(vidUrl))
			elif info[-1] == 'p':
				picUrl = info[6]
				picDescription = info[7]
				f.write('\nPicture description: {}'.format(picDescription))
				f.write('\nPicture url: {}'.format(picUrl))
	
	def DownloadFile(self, info):
		"""Download every picture/video of the post via wget.

		NOTE(review): URLs are interpolated into an os.system shell command;
		a hostile URL could inject shell commands — subprocess.run with an
		argument list would be safer.
		"""
		folder = self._Folder(info)
		os.makedirs(folder, exist_ok=True)  # bug fix: no spurious FileExistsError print
		
		# Shared wget command template: tool, folder, filename stem, extension, proxy, URL.
		wgetCmd = '{} --output-document={}\\{}.{} --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'
		
		if info[-1] == 'm':
			for i, (mediaUrl, mediaInfo) in enumerate(info[6], start=1):
				# All-digits second element => video; otherwise picture.
				ext = 'mp4' if str(mediaInfo).isdigit() else 'jpg'
				os.system(wgetCmd.format(wgetPath, folder, str(i), ext, httpsProxy, mediaUrl))
		elif info[-1] == 'v':
			os.system(wgetCmd.format(wgetPath, folder, '1', 'mp4', httpsProxy, info[6]))
		elif info[-1] == 'p':
			os.system(wgetCmd.format(wgetPath, folder, '1', 'jpg', httpsProxy, info[6]))
	
	def Main(self):
		"""Open the post page, scrape it, then save info.txt and the media files."""
		try:
			fxDriver.get(self.url)
			info = self.GetInfo()
			self.DownloadInfo(info)
			self.DownloadFile(info)
		except Exception as e:
			print(e)

class PROFILE:
	"""Collects the links of all not-yet-downloaded posts on a profile page.

	Relies on module-level globals: fxDriver, pageDownJS, outputPath.
	"""
	
	def __init__(self, profileUrl):
		# profileUrl: full URL of the account page.
		self.profileUrl = profileUrl
	
	def Update(self):
		"""Return how many posts still need downloading (total minus local folders).

		Raises ValueError when the profile JSON cannot be located (bug fix:
		previously this surfaced as a confusing NameError).
		"""
		jsonData = None  # bug fix: was unbound when no <script> matched
		for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
			try:
				jsonText = e.get_attribute('textContent')
				if 'viewerId' in jsonText:
					# Slice out the outermost {...} and drill down to the user object.
					jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['ProfilePage'][0]['graphql']['user']
					break
			except:
				continue
		
		if jsonData is None:
			raise ValueError('profile JSON not found on page: {}'.format(self.profileUrl))
		
		postCount = jsonData['edge_owner_to_timeline_media']['count']
		username = jsonData['username']
		folder = '{}\\{}'.format(outputPath, username)
		
		# Each already-downloaded post occupies one sub-folder.
		if os.path.exists(folder):
			downloadCount = len([x for x in os.listdir(folder) if os.path.isdir('{}\\{}'.format(folder, x))])
		else:
			downloadCount = 0
		
		return postCount - downloadCount
	
	def GetLocY(self):
		"""Scan the DOM for post links.

		Returns (y of the last link seen, {url: locX/1000 + locY}); the
		x/1000 term orders links within a row without disturbing row order.
		"""
		urlDict = {}
		locY = 0  # bug fix: defined even when no '/p/' links are loaded yet
		
		for e in fxDriver.find_elements_by_xpath('//a[contains(@href, "/p/")]'):
			locY = e.location['y']
			locX = e.location['x']
			url = e.get_attribute('href')
			urlDict[url] = locX/1000 + locY
		
		return locY, urlDict
	
	def JudgeLoading(self, locY, urlDict):
		"""Wait 0.5 s and re-scan; (None, fresh) while still loading, else (newY, merged)."""
		time.sleep(0.5)
		locYNew, urlDictNew = self.GetLocY()
		
		if locYNew > locY:
			# Page grew: fold the previously collected links into the fresh scan.
			urlDictNew.update(urlDict)
		else:
			locYNew = None
		
		return locYNew, urlDictNew
	
	def GetWholePage(self):
		"""Scroll until updateCount new post links are collected; return them in page order."""
		updateCount = self.Update()
		fxDriver.execute_script(pageDownJS)
		
		try:
			# Click the (Chinese-localized) "more posts" button if present.
			fxDriver.find_element_by_xpath('//div[contains(text(), "更多帖子")]').click()
		except Exception as e:
			print(e)
		
		locY, urlDict = self.GetLocY()
		
		while True:
			# Jump to the bottom to trigger lazy loading of more posts.
			fxDriver.execute_script(pageDownJS)
			while True:
				locYNew, urlDictNew = self.JudgeLoading(locY, urlDict)
				# Sort by on-page position (top-to-bottom, left-to-right).
				urlList = [t[0] for t in sorted(urlDictNew.items(), key=lambda x: x[1])]
				
				if len(urlList) >= updateCount:
					return urlList[: updateCount]
				
				if locYNew is None:
					continue  # still loading: poll again without scrolling
				locY = locYNew
				urlDict = urlDictNew
				break
	
	def Main(self):
		"""Open the profile page; return the post-link list, or None on error."""
		try:
			fxDriver.get(self.profileUrl)
			return self.GetWholePage()
		except Exception as e:
			print(e)

def Main():
	"""Entry point: prompt for a post link, profile link or bare username
	and scrape it; repeats forever.

	Bug fix: the original restarted itself via tail recursion (`Main()` at
	the end), which eventually hits Python's recursion limit on long
	sessions; a `while True` loop has identical behavior without that risk.
	"""
	while True:
		inputUrl = input('Please input the instagram link: ')
		
		if '/p/' in inputUrl:
			# A single post page.
			POST(inputUrl).Main()
		else:
			# Treat a bare username as a profile link.
			if not 'www.instagram.com' in inputUrl:
				inputUrl = 'https://www.instagram.com/{}/'.format(inputUrl)
			
			urlList = PROFILE(inputUrl).Main()
			
			if urlList:
				for url in urlList:
					POST(url).Main()

if __name__ == '__main__':
	Main()
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章