【Python】爬取并下载Instagram帖子的信息、图片和视频

目录
0. 项目介绍
1. 爬取账户页中所有帖子的链接
2. 爬取并下载帖子页中的信息、图片和视频
3. 完整代码

0. 项目介绍

本项目的目的是输入Instagram账号或账户页链接或帖子页链接,输出该账户帖子的:① 包含当前时间、用户上传帖子的时间(当前时区)、用户名称(Username)、用户全称(Full name)、帖子文字、点赞数、评论数、图片描述(当帖子中有图片时)、图片链接(当帖子中有图片时)、视频观看数(当帖子中有视频时)、视频链接(当帖子中有视频时)的文本文档;② 图片(当帖子中有图片时)、视频(当帖子中有视频时)。


本项目需要先导入如下库:

from selenium import webdriver
from multiprocessing import Pool
import json, time, os

本项目的全局变量如下:

# --- user configuration (fill in before running) ---
sslPort = 1080	# local port of the HTTPS proxy (placeholder: the original left the value blank, which is a SyntaxError)
fxBinaryPath = ''	# absolute path of firefox.exe
geckodriverPath = ''	# absolute path of geckodriver.exe
pageDownJS = 'document.documentElement.scrollTop = 100000000'	# JS that scrolls to the bottom of the page
outputPath = ''	# output directory
ariaPath = ''	# absolute path of the downloader executable (NOTE: the final code names this wgetPath)
httpsProxy = 'https://127.0.0.1:{}/'.format(str(sslPort))	# HTTPS proxy URL handed to the downloader

解释:

  • sslPort:可用于访问Instagram的HTTPS代理的本地端口。
  • fxBinaryPath:Firefox浏览器的firefox.exe的绝对路径。
  • geckodriverPath:geckodriver.exe的绝对路径。
  • pageDownJS:用于下拉页面的JavaScript代码。
  • outputPath:输出路径。
  • ariaPath:aria2c.exe的绝对路径(注意:完整代码中该变量名为wgetPath,指向wget.exe)。
  • httpsProxy:用于GNU Wget for Windows的HTTPS代理。

本项目的基本结构如下:

def Driver():
	"""Build and return a Firefox browser (WebDriver) instance. (stub)"""

class DOWNLOAD:
	"""Multiprocess download helper. (stub)"""

class POST:
	"""Scrape and download one post page's info, pictures and videos. (stub)"""

class PROFILE:
	"""Scrape the links of every post on a profile page. (stub)"""

def Main():
	"""Entry point: accept a username / profile URL / post URL and drive the classes above. (stub)"""

if __name__ == '__main__':
	Main()

本项目的运行流程可见Main函数:

def Main():
	"""Entry point: prompt for a post URL, profile URL or username and download it.

	Runs as an explicit endless loop. The original version called Main()
	recursively at the end, which accumulates stack frames and eventually
	raises RecursionError after ~1000 rounds.
	"""
	while True:
		fxDriver = Driver()
		inputUrl = input('Please input instagram link or username: ')
		
		if '/p/' in inputUrl:
			# A single post URL.
			POST(fxDriver, inputUrl).Main()
		else:
			# A username or profile URL: normalize to a full profile URL.
			if 'www.instagram.com' not in inputUrl:
				inputUrl = 'https://www.instagram.com/{}/'.format(inputUrl)
			urlList = PROFILE(fxDriver, inputUrl).Main()
			if urlList:
				total = len(urlList)
				for done, url in enumerate(urlList, 1):
					POST(fxDriver, url).Main()
					print('\n\n{:.2f} % completed.\n\n'.format(done / total * 100))
		
		fxDriver.quit()

1. 爬取账户页中所有帖子的链接

这一步的基本结构如下:

def Main(self):
	"""Open the profile page and return the list of post links (None on error)."""
	try:
		fxDriver.get(self.profileUrl)
		urlList = self.GetWholePage()
		return urlList
	except Exception as e:
		# Any scraping failure is printed, not re-raised; the caller gets None.
		print(e)

解释:① 浏览器访问账户页。② self.GetWholePage()负责爬取账户页中所有帖子的链接,生成链接列表urlList


self.GetWholePage()如下:

def GetWholePage(self):
	"""Scroll the profile page until updateCount post links have been collected.

	Returns the first updateCount links, ordered by their on-page position.
	"""
	updateCount = self.Update()
	fxDriver.execute_script(pageDownJS)
	
	# Older page layouts hide further posts behind a "more posts" button.
	try:
		fxDriver.find_element_by_xpath('//div[contains(text(), "更多帖子")]').click()
	except Exception as e:
		print(e)
	
	locY, urlDict = self.GetLocY()
	
	# NOTE(review): if the profile exposes fewer reachable links than
	# updateCount, this loops forever; a retry cap would make it safer.
	while 1:
		fxDriver.execute_script(pageDownJS)
		while 1:
			locYNew, urlDictNew = self.JudgeLoading(locY, urlDict)
			# Order links by their encoded (row, column) position.
			urlList = [t[0] for t in sorted(urlDictNew.items(), key=lambda x: x[1])]
			
			if len(urlList) >= updateCount:
				return urlList[: updateCount]
			
			if locYNew is None:
				# Page still loading: poll again.
				continue
			locY = locYNew
			urlDict = urlDictNew
			break

解释:

  • self.Update()用于计算需要更新的贴子数。
  • fxDriver.execute_script(pageDownJS)可以通过执行JS代码pageDownJS把页面拉到最下面。
  • self.GetLocY()可以获得账户页HTML中每个帖子链接所在tag的Y座标locY和当前加载的所有帖子的链接字典urlDict
  • self.JudgeLoading(locY, urlDict)可以对比输入的Y座标和0.5秒之后的Y座标来判断pageDownJS有没有执行完毕。

self.Update()如下:

def Update(self):
	"""Return the number of posts that still need to be downloaded.

	Reads the profile's total post count from the embedded JSON and
	subtracts the number of per-post folders already on disk.
	"""
	# Find the inline <script> that carries the page's JSON payload.
	for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
		try:
			jsonText = e.get_attribute('textContent')
			if 'viewerId' in jsonText:
				# Slice out the outermost {...} and drill into the user node.
				jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['ProfilePage'][0]['graphql']['user']
				break
		except:
			# NOTE(review): bare except skips scripts that fail to parse; if no
			# script matches, jsonData stays unbound and the next access raises
			# NameError (caught by the caller's try/except).
			continue
	
	postCount = jsonData['edge_owner_to_timeline_media']['count']
	username = jsonData['username']
	folder = '{}\\{}'.format(outputPath, username)
	
	# Every downloaded post gets its own sub-folder, so counting sub-folders
	# counts downloaded posts.
	if os.path.exists(folder):
		downloadCount = len([x for x in os.listdir(folder) if os.path.isdir('{}\\{}'.format(folder, x))])
	else:
		downloadCount = 0
	
	updateCount = postCount - downloadCount
	
	return updateCount

解释:① 解析网页中的贴子数。② 统计已经下载了多少帖子。③ 计算需要更新的贴子数。


self.GetLocY()如下:

def GetLocY(self):
	"""Collect every post link currently present in the DOM.

	Returns (locY, urlDict): locY is the y coordinate of the LAST link
	visited (NOTE(review): locY stays unbound and raises NameError when the
	page has no post link), and urlDict maps each link URL to
	locX/1000 + locY so links sort in on-page (row, column) order.
	"""
	urlDict = {}
	
	for e in fxDriver.find_elements_by_xpath('//a[contains(@href, "/p/")]'):
		locY = e.location['y']
		locX = e.location['x']
		url = e.get_attribute('href')
		# Fold (row, column) into one sortable number.
		urlDict[url] = locX/1000 + locY
	
	return locY, urlDict

解释:通过循环判断'/p/'有没有在a标签的'href'属性中来获得帖子链接及其所在tag的Y座标。


self.JudgeLoading(locY, urlDict)如下:

def JudgeLoading(self, locY, urlDict):
	"""Wait half a second, re-scan the page, and report whether it grew.

	Returns (newY, merged dict) when new content appeared below the previous
	bottom, or (None, fresh dict) when the page has not grown yet.
	"""
	time.sleep(0.5)
	newY, freshDict = self.GetLocY()
	
	if newY <= locY:
		# No growth yet: None tells the caller to keep polling.
		return None, freshDict
	
	# Page grew: carry the previously seen links over into the fresh scan.
	freshDict.update(urlDict)
	return newY, freshDict

把上述模块如下整合到类中:

class PROFILE:
	"""Scrape the links of all posts on an Instagram profile page.

	Relies on module-level globals: fxDriver (Selenium Firefox driver),
	outputPath and pageDownJS.
	"""
	
	def __init__(self, profileUrl):
		# profileUrl: URL of the profile page to scrape.
		self.profileUrl = profileUrl
	
	def Update(self):
		"""Return the number of posts that still need to be downloaded.

		Reads the profile's total post count from the embedded JSON and
		subtracts the number of per-post folders already on disk.
		"""
		# Find the inline <script> that carries the page's JSON payload.
		for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
			try:
				jsonText = e.get_attribute('textContent')
				if 'viewerId' in jsonText:
					# Slice out the outermost {...} and drill into the user node.
					jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['ProfilePage'][0]['graphql']['user']
					break
			except:
				# NOTE(review): bare except skips unparsable scripts; if none
				# matches, jsonData stays unbound and the next access raises
				# NameError (caught by Main's try/except).
				continue
		
		postCount = jsonData['edge_owner_to_timeline_media']['count']
		username = jsonData['username']
		folder = '{}\\{}'.format(outputPath, username)
		
		# Every downloaded post gets its own sub-folder, so counting
		# sub-folders counts downloaded posts.
		if os.path.exists(folder):
			downloadCount = len([x for x in os.listdir(folder) if os.path.isdir('{}\\{}'.format(folder, x))])
		else:
			downloadCount = 0
		
		updateCount = postCount - downloadCount
		
		return updateCount
	
	def GetLocY(self):
		"""Collect every post link currently present in the DOM.

		Returns (locY, urlDict): locY is the y coordinate of the LAST link
		visited (NameError when no link exists), and urlDict maps each link
		URL to locX/1000 + locY so links sort in on-page (row, column) order.
		"""
		urlDict = {}
		
		for e in fxDriver.find_elements_by_xpath('//a[contains(@href, "/p/")]'):
			locY = e.location['y']
			locX = e.location['x']
			url = e.get_attribute('href')
			# Fold (row, column) into one sortable number.
			urlDict[url] = locX/1000 + locY
		
		return locY, urlDict
	
	def JudgeLoading(self, locY, urlDict):
		"""Wait 0.5 s, re-scan, and report whether scrolling loaded new content.

		Returns (newY, merged dict) when the page grew, or (None, fresh dict)
		when it has not grown yet.
		"""
		time.sleep(0.5)		
		locYNew, urlDictNew = self.GetLocY()
		
		if locYNew > locY:
			# Page grew: keep previously seen links alongside the new ones.
			urlDictNew.update(urlDict)
		else:
			# No growth yet: None tells the caller to keep polling.
			locYNew = None
		
		return locYNew, urlDictNew
	
	def GetWholePage(self):
		"""Scroll the profile page until updateCount post links are collected."""
		updateCount = self.Update()
		fxDriver.execute_script(pageDownJS)
		
		# Older layouts hide further posts behind a "more posts" button.
		try:
			fxDriver.find_element_by_xpath('//div[contains(text(), "更多帖子")]').click()
		except Exception as e:
			print(e)
		
		locY, urlDict = self.GetLocY()
		
		# NOTE(review): if the profile exposes fewer reachable links than
		# updateCount, this loops forever.
		while 1:
			fxDriver.execute_script(pageDownJS)						
			while 1:
				locYNew, urlDictNew = self.JudgeLoading(locY, urlDict)				
				# Order links by their on-page position.
				urlList = [t[0] for t in sorted(urlDictNew.items(), key = lambda x:x[1])]
				
				if len(urlList) >= updateCount:
					return urlList[: updateCount]
				
				if locYNew == None:
					# Page still loading: poll again.
					continue
				else:
					locY = locYNew
					urlDict = urlDictNew
					break
	
	def Main(self):
		"""Open the profile page and return the post-link list (None on error)."""
		try:
			fxDriver.get(self.profileUrl)
			urlList = self.GetWholePage()
			return urlList
		except Exception as e:
			# Any scraping failure is printed, not re-raised.
			print(e)

解释:可以通过执行PROFILE(inputUrl).Main()获得账户页中所有帖子的链接。

2. 爬取并下载帖子页中的信息、图片和视频

这一步的基本结构如下:

def Main(self):
	"""Open the post page, scrape it, and persist the info file plus media."""
	try:
		fxDriver.get(self.url)
		post_info = self.GetInfo()
		self.DownloadInfo(post_info)
		self.DownloadFile(post_info)
	except Exception as err:
		# Report and swallow so one bad post does not abort a batch run.
		print(err)

解释:① 浏览器访问帖子页。② self.GetInfo()可以获得用户上传帖子的时间(当前时区)、用户名称(Username)、用户全称(Full name)、帖子文字、点赞数、评论数、图片描述(当帖子中有图片时)、图片链接(当帖子中有图片时)、视频观看数(当帖子中有视频时)、视频链接(当帖子中有视频时)等信息。③ self.DownloadInfo(info)把信息写入文本文档。④ self.DownloadFile(info)根据获取的信息下载帖子页中的图片和视频。


self.GetInfo()如下:

def GetInfo(self):
	"""Parse the post page's embedded JSON and return a tuple of post data.

	The tuple always starts with (uploadTime, username, fullName, likes,
	comments, text, ...) and ends with a tag: 'm' (multi-media dict),
	'v' (single video) or 'p' (single picture).
	"""
	# Find the inline <script> carrying the page's JSON payload; the
	# logged-out page contains "viewerId":null in that script.
	for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
		try:
			jsonText = e.get_attribute('textContent')
			if '"viewerId":null' in jsonText:
				# Slice out the outermost {...} and drill to the media node.
				jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['PostPage'][0]['graphql']['shortcode_media']
				break
		except:
			# Skip scripts that fail to parse; if none matches, jsonData stays
			# unbound and the code below raises NameError (caught by Main).
			continue
	
	uploadTimeStamp = jsonData['taken_at_timestamp']
	# Convert the POSIX timestamp to local time.
	uploadTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(uploadTimeStamp))
	username = jsonData['owner']['username']
	fullName = jsonData['owner']['full_name']
	likes = jsonData['edge_media_preview_like']['count']
	comments = jsonData['edge_media_preview_comment']['count']
	try:
		text = jsonData['edge_media_to_caption']['edges'][0]['node']['text']
	except:
		# Post has no caption.
		text = ''
	
	try:
		# Multi-media (sidecar) post. NOTE(review): keying by URL silently
		# drops duplicate URLs; the final version of this code uses a list
		# of (url, info) tuples instead.
		mediaDict = {}
		for obj in jsonData['edge_sidecar_to_children']['edges']:
			try:
				vidUrl = obj['node']['video_url']
				vidViewCount = obj['node']['video_view_count']
				mediaDict[vidUrl] = vidViewCount
			except:
				# No 'video_url' key: this child is a picture.
				picUrl = obj['node']['display_url']
				picDescription = obj['node']['accessibility_caption']
				mediaDict[picUrl] = picDescription
		return uploadTime, username, fullName, likes, comments, text, mediaDict, 'm'
	except:
		# Not a sidecar post: single video or single picture.
		try:
			vidUrl = jsonData['video_url']
			vidViewCount = jsonData['video_view_count']
			return uploadTime, username, fullName, likes, comments, text, vidUrl, vidViewCount, 'v'
		except:
			picUrl = jsonData['display_url']
			picDescription = jsonData['accessibility_caption']
			return uploadTime, username, fullName, likes, comments, text, picUrl, picDescription, 'p'

解释:① 无需登录即可获取包含所有信息的JSON数据。② 获取一些通用的信息,如发布时间、用户名、点赞数、评论数等。③ 获取图片或视频的链接和相关信息。帖子可以分为多图(视频)和单图(视频),而在多图(视频)帖子中需要判断是图片还是视频,在单图(视频)帖子中也要判断。


self.DownloadInfo(info)如下:

def DownloadInfo(self, info):
	"""Write the scraped post info to <outputPath>\\<username>\\<timestamp>\\info.txt."""
	now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(time.time())))
	uploadTime = info[0]
	username = info[1]
	fullName = info[2]
	likes = info[3]
	comments = info[4]
	text = info[5]
	# Folder name is the upload time with separators stripped (YYYYMMDDHHMMSS).
	folder = '{}\\{}\\{}'.format(outputPath, username, uploadTime.replace('-', '').replace(':', '').replace(' ', ''))
	
	try:
		os.makedirs(folder)
	except Exception as e:
		# Typically: the folder already exists from a previous run.
		print(e)
	
	with open('{}\\info.txt'.format(folder), 'w', encoding='utf-8') as f:
		f.write('Now: {}'.format(now))
		f.write('\nUpload time: {}'.format(uploadTime))
		f.write('\nUsername: {}'.format(username))
		f.write('\nFull name: {}'.format(fullName))
		f.write('\nText: {}'.format(text))
		f.write('\nLikes: {}'.format(likes))
		f.write('\nComments: {}'.format(comments))
		
		if info[-1] == 'm':
			mediaDict = info[6]
			i = 1
			for mediaUrl, mediaInfo in mediaDict.items():
				# A numeric value is a video view count; otherwise it is a
				# picture description.
				if str(mediaInfo).isdigit():
					f.write('\n{}. Video view count: {}'.format(str(i), str(mediaInfo)))
					f.write('\n{}. Video url: {}'.format(str(i), mediaUrl))
				else:
					f.write('\n{}. Picture description: {}'.format(str(i), mediaInfo))
					f.write('\n{}. Picture url: {}'.format(str(i), mediaUrl))
				i += 1
		elif info[-1] == 'v':
			vidUrl = info[6]
			vidViewCount = info[7]
			f.write('\nVideo view count: {}'.format(vidViewCount))
			f.write('\nVideo url: {}'.format(vidUrl))
		elif info[-1] == 'p':
			picUrl = info[6]
			picDescription = info[7]
			f.write('\nPicture description: {}'.format(picDescription))
			f.write('\nPicture url: {}'.format(picUrl))

self.DownloadFile(info)如下:

def DownloadFile(self, info):
	"""Download the post's media with wget through the HTTPS proxy.

	NOTE(review): this snippet references wgetPath although the globals shown
	earlier declare ariaPath. Also, scraped URLs are interpolated into an
	os.system shell command — a hostile URL could inject shell commands;
	subprocess.run with a list argument would be safer.
	"""
	uploadTime = info[0]
	username = info[1]
	# Same folder naming scheme as DownloadInfo (digits of the upload time).
	folder = '{}\\{}\\{}'.format(outputPath, username, uploadTime.replace('-', '').replace(':', '').replace(' ', ''))
	
	if info[-1] == 'm':
		mediaDict = info[6]
		i = 1
		for mediaUrl, mediaInfo in mediaDict.items():
			# Numeric info means video; otherwise picture.
			if str(mediaInfo).isdigit():
				os.system('{} --output-document={}\\{}.mp4 --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, str(i), httpsProxy, mediaUrl))
			else:
				os.system('{} --output-document={}\\{}.jpg --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, str(i), httpsProxy, mediaUrl))
			i += 1
	elif info[-1] == 'v':
		vidUrl = info[6]
		os.system('{} --output-document={}\\1.mp4 --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, httpsProxy, vidUrl))
	elif info[-1] == 'p':
		picUrl = info[6]
		os.system('{} --output-document={}\\1.jpg --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, httpsProxy, picUrl))

解释:wget各命令行参数的用法可参考GNU Wget官方文档。


把上述模块如下整合到类中:

class POST:
	"""Scrape one Instagram post page and download its info, pictures and videos.

	Relies on module-level globals: fxDriver (Selenium Firefox driver),
	outputPath, wgetPath and httpsProxy.
	"""
	
	def __init__(self, url):
		# url: the post page URL (contains '/p/').
		self.url = url
	
	def GetInfo(self):		
		"""Parse the post page's embedded JSON and return a tuple of post data.

		The tuple always starts with (uploadTime, username, fullName, likes,
		comments, text, ...) and ends with a tag: 'm' (multi-media list),
		'v' (single video) or 'p' (single picture).
		"""
		# Find the inline <script> that carries the page's JSON payload.
		for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
			try:
				jsonText = e.get_attribute('textContent')
				if 'viewerId' in jsonText:
					# Slice out the outermost {...} and drill to the media node.
					jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['PostPage'][0]['graphql']['shortcode_media']
					break
			except:
				# Skip unparsable scripts; if none matches, jsonData stays
				# unbound and the code below raises NameError (caught by Main).
				continue
		
		uploadTimeStamp = jsonData['taken_at_timestamp']
		# Convert the POSIX timestamp to local time.
		uploadTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(uploadTimeStamp))
		username = jsonData['owner']['username']
		fullName = jsonData['owner']['full_name']
		likes = jsonData['edge_media_preview_like']['count']
		comments = jsonData['edge_media_preview_comment']['count']
		try:
			text = jsonData['edge_media_to_caption']['edges'][0]['node']['text']
		except:
			# Post has no caption.
			text = ''
		
		try:
			# Multi-media (sidecar) post: collect (url, count-or-description)
			# tuples in on-page order.
			mediaList = []
			for obj in jsonData['edge_sidecar_to_children']['edges']:
				try:
					vidUrl = obj['node']['video_url']
					vidViewCount = obj['node']['video_view_count']
					mediaList.append((vidUrl, vidViewCount))
				except:
					# No 'video_url' key: this child is a picture.
					picUrl = obj['node']['display_url']
					picDescription = obj['node']['accessibility_caption']
					mediaList.append((picUrl, picDescription))
			return uploadTime, username, fullName, likes, comments, text, mediaList, 'm'
		except:
			# Not a sidecar post: single video or single picture.
			try:
				vidUrl = jsonData['video_url']
				vidViewCount = jsonData['video_view_count']
				return uploadTime, username, fullName, likes, comments, text, vidUrl, vidViewCount, 'v'
			except:
				picUrl = jsonData['display_url']
				picDescription = jsonData['accessibility_caption']
				return uploadTime, username, fullName, likes, comments, text, picUrl, picDescription, 'p'
	
	def DownloadInfo(self, info):
		"""Write the scraped post info to <outputPath>\\<username>\\<timestamp>\\info.txt."""
		now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(time.time())))
		uploadTime = info[0]
		username = info[1]
		fullName = info[2]
		likes = info[3]
		comments = info[4]
		text = info[5]
		# Folder name is the digits of the upload time (YYYYMMDDHHMMSS).
		folder = '{}\\{}\\{}'.format(outputPath, username, ''.join([x for x in uploadTime if x.isdigit()]))
		
		try:
			os.makedirs(folder)
		except Exception as e:
			# Typically: folder already exists from a previous run.
			print(e)
		
		with open('{}\\info.txt'.format(folder), 'w', encoding='utf-8') as f:
			f.write('Now: {}'.format(now))
			f.write('\nUpload time: {}'.format(uploadTime))
			f.write('\nUsername: {}'.format(username))
			f.write('\nFull name: {}'.format(fullName))
			f.write('\nText: {}'.format(text))
			f.write('\nLikes: {}'.format(likes))
			f.write('\nComments: {}'.format(comments))
			
			if info[-1] == 'm':
				mediaList = info[6]
				i = 1
				for mediaTuple in mediaList:
					# A numeric second field is a video view count; otherwise
					# it is a picture description.
					if str(mediaTuple[1]).isdigit():
						vidUrl = mediaTuple[0]; vidViewCount = str(mediaTuple[1])
						f.write('\n{}. Video view count: {} Video url: {}'.format(str(i), vidViewCount, vidUrl))
					else:
						picUrl = mediaTuple[0]; picDescription = mediaTuple[1]
						f.write('\n{}. Picture description: {} Picture url: {}'.format(str(i), picDescription, picUrl))
					i += 1
			elif info[-1] == 'v':
				vidUrl = info[6]
				vidViewCount = info[7]
				f.write('\nVideo view count: {}'.format(vidViewCount))
				f.write('\nVideo url: {}'.format(vidUrl))
			elif info[-1] == 'p':
				picUrl = info[6]
				picDescription = info[7]
				f.write('\nPicture description: {}'.format(picDescription))
				f.write('\nPicture url: {}'.format(picUrl))
	
	def DownloadFile(self, info):
		"""Download the post's media with wget through the HTTPS proxy.

		NOTE(review): scraped URLs are interpolated into an os.system shell
		command — a hostile URL could inject shell commands; subprocess.run
		with a list argument would be safer.
		"""
		uploadTime = info[0]
		username = info[1]
		# Same folder naming scheme as DownloadInfo.
		folder = '{}\\{}\\{}'.format(outputPath, username, ''.join([x for x in uploadTime if x.isdigit()]))
		
		try:
			os.makedirs(folder)
		except Exception as e:
			# Typically: folder already created by DownloadInfo.
			print(e)
		
		if info[-1] == 'm':
			mediaList = info[6]
			i = 1
			for mediaTuple in mediaList:
				# Numeric second field means video; otherwise picture.
				if str(mediaTuple[1]).isdigit():
					vidUrl = mediaTuple[0]
					os.system('{} --output-document={}\\{}.mp4 --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, str(i), httpsProxy, vidUrl))
				else:
					picUrl = mediaTuple[0]
					os.system('{} --output-document={}\\{}.jpg --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, str(i), httpsProxy, picUrl))
				i += 1
		elif info[-1] == 'v':
			vidUrl = info[6]
			os.system('{} --output-document={}\\1.mp4 --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, httpsProxy, vidUrl))
		elif info[-1] == 'p':
			picUrl = info[6]
			os.system('{} --output-document={}\\1.jpg --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, httpsProxy, picUrl))
	
	def Main(self):
		"""Open the post page, scrape it and save the info file plus media."""
		try:
			fxDriver.get(self.url)
			info = self.GetInfo()
			self.DownloadInfo(info)
			self.DownloadFile(info)
		except Exception as e:
			# Report and swallow so one bad post does not abort a batch run.
			print(e)

解释:可以通过执行POST(url).Main()下载每一个帖子的信息、图片和视频。

3. 完整代码

from selenium import webdriver
import json, time, os

# --- user configuration (fill in before running) ---
sslPort = 1080	# local HTTPS-proxy port (placeholder: the original left the value blank, which is a SyntaxError)
fxBinaryPath = ''	# absolute path of firefox.exe
geckodriverPath = ''	# absolute path of geckodriver.exe
pageDownJS = 'document.documentElement.scrollTop = 100000000'	# JS that scrolls to the page bottom
outputPath = ''	# output directory
wgetPath = ''	# absolute path of wget.exe
httpsProxy = 'https://127.0.0.1:{}/'.format(str(sslPort))	# proxy URL handed to wget

# Route all browser traffic through the local HTTPS proxy and block image
# loading to speed up page loads.
fxProfile = webdriver.firefox.firefox_profile.FirefoxProfile()
fxProfile.set_preference('network.proxy.type', 1)	# manual proxy configuration
fxProfile.set_preference('network.proxy.ssl', '127.0.0.1')
fxProfile.set_preference('network.proxy.ssl_port', sslPort)
fxProfile.set_preference('network.proxy.socks_remote_dns', True)
fxProfile.set_preference('network.trr.mode', 2)	# DNS over HTTPS first — presumably to avoid DNS interference
fxProfile.set_preference('permissions.default.image', 2)	# 2 = block images
fxProfile.set_preference('intl.accept_languages', 'zh-CN, zh, zh-TW, zh-HK, en-US, en')
# NOTE(review): firefox_profile / firefox_binary / executable_path are removed
# in Selenium 4; migrate to Options/Service objects when upgrading.
fxDriver = webdriver.firefox.webdriver.WebDriver(firefox_profile=fxProfile, firefox_binary=fxBinaryPath, executable_path=geckodriverPath)

class POST:
	"""Scrape one Instagram post page and download its info, pictures and videos.

	Relies on module-level globals: fxDriver (Selenium Firefox driver),
	outputPath, wgetPath and httpsProxy.
	"""
	
	def __init__(self, url):
		# url: the post page URL (contains '/p/').
		self.url = url
	
	def GetInfo(self):		
		"""Parse the post page's embedded JSON and return a tuple of post data.

		The tuple always starts with (uploadTime, username, fullName, likes,
		comments, text, ...) and ends with a tag: 'm' (multi-media list),
		'v' (single video) or 'p' (single picture).
		"""
		# Find the inline <script> that carries the page's JSON payload.
		for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
			try:
				jsonText = e.get_attribute('textContent')
				if 'viewerId' in jsonText:
					# Slice out the outermost {...} and drill to the media node.
					jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['PostPage'][0]['graphql']['shortcode_media']
					break
			except:
				# Skip unparsable scripts; if none matches, jsonData stays
				# unbound and the code below raises NameError (caught by Main).
				continue
		
		uploadTimeStamp = jsonData['taken_at_timestamp']
		# Convert the POSIX timestamp to local time.
		uploadTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(uploadTimeStamp))
		username = jsonData['owner']['username']
		fullName = jsonData['owner']['full_name']
		likes = jsonData['edge_media_preview_like']['count']
		comments = jsonData['edge_media_preview_comment']['count']
		try:
			text = jsonData['edge_media_to_caption']['edges'][0]['node']['text']
		except:
			# Post has no caption.
			text = ''
		
		try:
			# Multi-media (sidecar) post: collect (url, count-or-description)
			# tuples in on-page order.
			mediaList = []
			for obj in jsonData['edge_sidecar_to_children']['edges']:
				try:
					vidUrl = obj['node']['video_url']
					vidViewCount = obj['node']['video_view_count']
					mediaList.append((vidUrl, vidViewCount))
				except:
					# No 'video_url' key: this child is a picture.
					picUrl = obj['node']['display_url']
					picDescription = obj['node']['accessibility_caption']
					mediaList.append((picUrl, picDescription))
			return uploadTime, username, fullName, likes, comments, text, mediaList, 'm'
		except:
			# Not a sidecar post: single video or single picture.
			try:
				vidUrl = jsonData['video_url']
				vidViewCount = jsonData['video_view_count']
				return uploadTime, username, fullName, likes, comments, text, vidUrl, vidViewCount, 'v'
			except:
				picUrl = jsonData['display_url']
				picDescription = jsonData['accessibility_caption']
				return uploadTime, username, fullName, likes, comments, text, picUrl, picDescription, 'p'
	
	def DownloadInfo(self, info):
		"""Write the scraped post info to <outputPath>\\<username>\\<timestamp>\\info.txt."""
		now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(time.time())))
		uploadTime = info[0]
		username = info[1]
		fullName = info[2]
		likes = info[3]
		comments = info[4]
		text = info[5]
		# Folder name is the digits of the upload time (YYYYMMDDHHMMSS).
		folder = '{}\\{}\\{}'.format(outputPath, username, ''.join([x for x in uploadTime if x.isdigit()]))
		
		try:
			os.makedirs(folder)
		except Exception as e:
			# Typically: folder already exists from a previous run.
			print(e)
		
		with open('{}\\info.txt'.format(folder), 'w', encoding='utf-8') as f:
			f.write('Now: {}'.format(now))
			f.write('\nUpload time: {}'.format(uploadTime))
			f.write('\nUsername: {}'.format(username))
			f.write('\nFull name: {}'.format(fullName))
			f.write('\nText: {}'.format(text))
			f.write('\nLikes: {}'.format(likes))
			f.write('\nComments: {}'.format(comments))
			
			if info[-1] == 'm':
				mediaList = info[6]
				i = 1
				for mediaTuple in mediaList:
					# A numeric second field is a video view count; otherwise
					# it is a picture description.
					if str(mediaTuple[1]).isdigit():
						vidUrl = mediaTuple[0]; vidViewCount = str(mediaTuple[1])
						f.write('\n{}. Video view count: {} Video url: {}'.format(str(i), vidViewCount, vidUrl))
					else:
						picUrl = mediaTuple[0]; picDescription = mediaTuple[1]
						f.write('\n{}. Picture description: {} Picture url: {}'.format(str(i), picDescription, picUrl))
					i += 1
			elif info[-1] == 'v':
				vidUrl = info[6]
				vidViewCount = info[7]
				f.write('\nVideo view count: {}'.format(vidViewCount))
				f.write('\nVideo url: {}'.format(vidUrl))
			elif info[-1] == 'p':
				picUrl = info[6]
				picDescription = info[7]
				f.write('\nPicture description: {}'.format(picDescription))
				f.write('\nPicture url: {}'.format(picUrl))
	
	def DownloadFile(self, info):
		"""Download the post's media with wget through the HTTPS proxy.

		NOTE(review): scraped URLs are interpolated into an os.system shell
		command — a hostile URL could inject shell commands; subprocess.run
		with a list argument would be safer.
		"""
		uploadTime = info[0]
		username = info[1]
		# Same folder naming scheme as DownloadInfo.
		folder = '{}\\{}\\{}'.format(outputPath, username, ''.join([x for x in uploadTime if x.isdigit()]))
		
		try:
			os.makedirs(folder)
		except Exception as e:
			# Typically: folder already created by DownloadInfo.
			print(e)
		
		if info[-1] == 'm':
			mediaList = info[6]
			i = 1
			for mediaTuple in mediaList:
				# Numeric second field means video; otherwise picture.
				if str(mediaTuple[1]).isdigit():
					vidUrl = mediaTuple[0]
					os.system('{} --output-document={}\\{}.mp4 --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, str(i), httpsProxy, vidUrl))
				else:
					picUrl = mediaTuple[0]
					os.system('{} --output-document={}\\{}.jpg --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, str(i), httpsProxy, picUrl))
				i += 1
		elif info[-1] == 'v':
			vidUrl = info[6]
			os.system('{} --output-document={}\\1.mp4 --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, httpsProxy, vidUrl))
		elif info[-1] == 'p':
			picUrl = info[6]
			os.system('{} --output-document={}\\1.jpg --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, httpsProxy, picUrl))
	
	def Main(self):
		"""Open the post page, scrape it and save the info file plus media."""
		try:
			fxDriver.get(self.url)
			info = self.GetInfo()
			self.DownloadInfo(info)
			self.DownloadFile(info)
		except Exception as e:
			# Report and swallow so one bad post does not abort a batch run.
			print(e)

class PROFILE:
	"""Scrape the links of all posts on an Instagram profile page.

	Relies on module-level globals: fxDriver (Selenium Firefox driver),
	outputPath and pageDownJS.
	"""
	
	def __init__(self, profileUrl):
		# profileUrl: URL of the profile page to scrape.
		self.profileUrl = profileUrl
	
	def Update(self):
		"""Return the number of posts that still need to be downloaded.

		Reads the profile's total post count from the embedded JSON and
		subtracts the number of per-post folders already on disk.
		"""
		# Find the inline <script> that carries the page's JSON payload.
		for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
			try:
				jsonText = e.get_attribute('textContent')
				if 'viewerId' in jsonText:
					# Slice out the outermost {...} and drill into the user node.
					jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['ProfilePage'][0]['graphql']['user']
					break
			except:
				# NOTE(review): bare except skips unparsable scripts; if none
				# matches, jsonData stays unbound and the next access raises
				# NameError (caught by Main's try/except).
				continue
		
		postCount = jsonData['edge_owner_to_timeline_media']['count']
		username = jsonData['username']
		folder = '{}\\{}'.format(outputPath, username)
		
		# Every downloaded post gets its own sub-folder, so counting
		# sub-folders counts downloaded posts.
		if os.path.exists(folder):
			downloadCount = len([x for x in os.listdir(folder) if os.path.isdir('{}\\{}'.format(folder, x))])
		else:
			downloadCount = 0
		
		updateCount = postCount - downloadCount
		
		return updateCount
	
	def GetLocY(self):
		"""Collect every post link currently present in the DOM.

		Returns (locY, urlDict): locY is the y coordinate of the LAST link
		visited (NameError when no link exists), and urlDict maps each link
		URL to locX/1000 + locY so links sort in on-page (row, column) order.
		"""
		urlDict = {}
		
		for e in fxDriver.find_elements_by_xpath('//a[contains(@href, "/p/")]'):
			locY = e.location['y']
			locX = e.location['x']
			url = e.get_attribute('href')
			# Fold (row, column) into one sortable number.
			urlDict[url] = locX/1000 + locY
		
		return locY, urlDict
	
	def JudgeLoading(self, locY, urlDict):
		"""Wait 0.5 s, re-scan, and report whether scrolling loaded new content.

		Returns (newY, merged dict) when the page grew, or (None, fresh dict)
		when it has not grown yet.
		"""
		time.sleep(0.5)		
		locYNew, urlDictNew = self.GetLocY()
		
		if locYNew > locY:
			# Page grew: keep previously seen links alongside the new ones.
			urlDictNew.update(urlDict)
		else:
			# No growth yet: None tells the caller to keep polling.
			locYNew = None
		
		return locYNew, urlDictNew
	
	def GetWholePage(self):
		"""Scroll the profile page until updateCount post links are collected."""
		updateCount = self.Update()
		fxDriver.execute_script(pageDownJS)
		
		# Older layouts hide further posts behind a "more posts" button.
		try:
			fxDriver.find_element_by_xpath('//div[contains(text(), "更多帖子")]').click()
		except Exception as e:
			print(e)
		
		locY, urlDict = self.GetLocY()
		
		# NOTE(review): if the profile exposes fewer reachable links than
		# updateCount, this loops forever.
		while 1:
			fxDriver.execute_script(pageDownJS)						
			while 1:
				locYNew, urlDictNew = self.JudgeLoading(locY, urlDict)				
				# Order links by their on-page position.
				urlList = [t[0] for t in sorted(urlDictNew.items(), key = lambda x:x[1])]
				
				if len(urlList) >= updateCount:
					return urlList[: updateCount]
				
				if locYNew == None:
					# Page still loading: poll again.
					continue
				else:
					locY = locYNew
					urlDict = urlDictNew
					break
	
	def Main(self):
		"""Open the profile page and return the post-link list (None on error)."""
		try:
			fxDriver.get(self.profileUrl)
			urlList = self.GetWholePage()
			return urlList
		except Exception as e:
			# Any scraping failure is printed, not re-raised.
			print(e)

def Main():
	"""Entry point: prompt for a post URL, profile URL or username and download it.

	Runs as an explicit endless loop. The original version called Main()
	recursively at the end, which accumulates stack frames and eventually
	raises RecursionError after ~1000 rounds.
	"""
	while True:
		inputUrl = input('Please input the instagram link: ')
		
		if '/p/' in inputUrl:
			# A single post URL.
			POST(inputUrl).Main()
		else:
			# A username or profile URL: normalize to a full profile URL.
			if 'www.instagram.com' not in inputUrl:
				inputUrl = 'https://www.instagram.com/{}/'.format(inputUrl)
			
			urlList = PROFILE(inputUrl).Main()
			
			if urlList:
				for url in urlList:
					POST(url).Main()

# Run the interactive downloader only when executed as a script.
if __name__ == '__main__':
	Main()
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章