【Python】爬取并下载Instagram账户中所有帖子的信息、图片和视频

目录
0. 项目介绍
1. 构造浏览器实例
2. 登录Instagram账户
3. 爬取账户页中所有帖子的链接
4. 爬取并下载帖子页中的信息、图片和视频
5. 完整代码

0. 项目介绍

本项目的目的是输入指定Instagram账户页的链接,输出该账户每一个帖子的:① 包含当前时间、用户上传帖子的时间(当前时区)、用户名称(Username)、用户全称(Full name)、帖子文字、点赞数、评论数、图片描述(当帖子中有图片时)、图片链接(当帖子中有图片时)、视频观看数(当帖子中有视频时)、视频链接(当帖子中有视频时)的文本文档;② 图片(当帖子中有图片时)、视频(当帖子中有视频时)。


本项目需要先导入如下库:

from selenium import webdriver
from bs4 import BeautifulSoup
import json, time, os

本项目的基本结构如下:

def Main():
	# Ask for the profile link, collect every post URL on that profile,
	# then download each post's information and media one by one.
	profileUrl = input('Please input the instagram profile link: ')
	
	post = POST()
	for url in PROFILE().Main(profileUrl):
		post.Main(url)

解释:在构造浏览器实例和登录Instagram账户之后,输入Instagram账户页的链接 profileUrl;PROFILE().Main(profileUrl) 负责爬取账户页中所有帖子的链接,生成链接列表 urlList;POST().Main(url) 负责下载每一个帖子的信息、图片和视频。

1. 构造浏览器实例

from selenium import webdriver

# Local proxy ports - fill these in before running. They are given
# placeholder values here so the file parses; the original left the
# right-hand sides empty, which is a SyntaxError.
socksPort = 0
httpPort = 0
sslPort = 0

fxProfile = webdriver.firefox.firefox_profile.FirefoxProfile()
# Route all traffic through local proxies (SOCKS + HTTP + SSL on 127.0.0.1).
fxProfile.set_preference('network.proxy.type', 1)  # 1 = manual proxy configuration
fxProfile.set_preference('network.proxy.socks', '127.0.0.1')
fxProfile.set_preference('network.proxy.socks_port', socksPort)
fxProfile.set_preference('network.proxy.http', '127.0.0.1')
fxProfile.set_preference('network.proxy.http_port', httpPort)
fxProfile.set_preference('network.proxy.ssl', '127.0.0.1')
fxProfile.set_preference('network.proxy.ssl_port', sslPort)
fxProfile.set_preference('network.proxy.socks_remote_dns', True)  # resolve DNS through the proxy
fxProfile.set_preference('network.trr.mode', 2)  # DNS-over-HTTPS with fallback to native DNS
fxProfile.set_preference('permissions.default.image', 2)  # block image loading to speed up crawling
fxProfile.set_preference('intl.accept_languages', 'zh-CN, zh, zh-TW, zh-HK, en-US, en')
# Absolute paths to firefox.exe and geckodriver.exe - fill these in.
fxBinaryPath = ''
geckodriverPath = ''
fxDriver = webdriver.firefox.webdriver.WebDriver(firefox_profile=fxProfile, firefox_binary=fxBinaryPath, executable_path=geckodriverPath)

解释:可以参考这篇博文。应根据手头上的工具分别设置SOCKS、HTTP和SSL代理的端口 socksPort、httpPort 和 sslPort;fxBinaryPath 是Firefox浏览器的firefox.exe的绝对路径,geckodriverPath 是geckodriver.exe的绝对路径。

2. 登录Instagram账户

# Log in to Instagram with the credentials below.
account = ''   # your Instagram username
password = ''  # your Instagram password
fxDriver.get('https://www.instagram.com/accounts/login/')
# Wait (up to 100 s) for the username field to appear, i.e. the login form loaded.
webdriver.support.ui.WebDriverWait(fxDriver, 100).until(lambda x: x.find_element_by_name('username'))
fxDriver.find_element_by_name('username').send_keys(account)
fxDriver.find_element_by_name('password').send_keys(password)
fxDriver.find_element_by_xpath('//button[@type="submit"]').click()

解释:account是你的Instagram用户名,password是你的Instagram密码。

3. 爬取账户页中所有帖子的链接

这一步的基本结构如下:

def Main(self, profileUrl):
	"""Open the profile page and return the list of all post links
	(None is implicitly returned when crawling fails)."""
	try:
		fxDriver.get(profileUrl)
		return PROFILE().GetWholePage()
	except Exception as e:
		print(e)

解释:浏览器先访问账户页,然后 PROFILE().GetWholePage() 负责爬取账户页中所有帖子的链接,生成链接列表 urlList。


PROFILE().GetWholePage()如下:

def GetWholePage(self):
	"""Keep scrolling to the bottom of the page until no new posts have
	loaded for more than 20 consecutive half-second checks, then return
	the accumulated list of post links."""
	locY, urlList = PROFILE().GetLocY()
	loadFailCount = 0
	# hoisted loop invariant: JS that jumps the viewport to the page bottom
	pageDownJS = 'document.documentElement.scrollTop=100000000'
	
	while True:
		fxDriver.execute_script(pageDownJS)
		
		while True:
			locYNew, urlListNew = PROFILE().JudgeLoading(locY, urlList)
			
			if locYNew is None:  # identity check, not == (was `== None`)
				loadFailCount += 1
				if loadFailCount > 20:
					return urlList
			else:
				loadFailCount = 0
				locY = locYNew
				urlList = urlListNew
				break

解释:

  1. PROFILE().GetLocY()可以获得账户页HTML中每个帖子链接所在tag的Y座标locY和当前加载的所有帖子的链接列表urlList
  2. fxDriver.execute_script(pageDownJS)可以通过执行JS代码document.documentElement.scrollTop=100000000把页面拉到最下面。
  3. PROFILE().JudgeLoading(locY, urlList)可以对比输入的Y座标和0.5秒之后的Y座标来判断fxDriver.execute_script(pageDownJS)有没有执行完毕。如果没有执行完毕则返回None,如果执行完毕则返回新的Y座标locYNew和新的链接列表urlListNew

PROFILE().GetLocY()如下:

def GetLocY(self):
	"""Scan every <a> tag on the page, collect post links (URLs
	containing '/p/') and return the Y coordinate of the last post link
	together with the list of links.

	Fix: locY is initialised to 0 so the function no longer raises
	UnboundLocalError when the page contains no post links at all."""
	locY = 0
	urlList = []
	
	for e in fxDriver.find_elements_by_tag_name('a'):
		try:
			url = e.get_attribute('href')
			if '/p/' in url:
				locY = e.location['y']
				urlList.append(url)
		except Exception:
			continue  # stale element or href is None - skip it
	
	return locY, urlList

解释:通过循环判断'/p/'有没有在a标签的'href'属性中来获得帖子链接及其所在tag的Y座标。


PROFILE().JudgeLoading(locY, urlList)如下:

def JudgeLoading(self, locY, urlList):
	"""Sleep half a second, re-scan the page and report whether new
	content appeared below the previous Y coordinate.

	Returns (newY, mergedLinks) on success, (None, None) when the page
	has not grown yet."""
	time.sleep(0.5)
	
	locYNew, urlListNew = PROFILE().GetLocY()
	
	if locYNew <= locY:
		return None, None  # nothing new loaded (yet)
	
	# merge old and new links, dropping duplicates
	return locYNew, list(set(urlListNew + urlList))

把上述模块如下整合到类中:

class PROFILE(object):
	"""Collects the links of every post on an Instagram profile page by
	repeatedly scrolling the (already opened) global Selenium driver
	``fxDriver`` to the bottom."""
	
	def GetLocY(self):
		"""Scan every <a> tag, collect post links (URLs containing
		'/p/') and return the Y coordinate of the last post link plus
		the list of links.

		Fix: locY defaults to 0 so an empty profile page no longer
		raises UnboundLocalError."""
		locY = 0
		urlList = []
		
		for e in fxDriver.find_elements_by_tag_name('a'):
			try:
				url = e.get_attribute('href')
				if '/p/' in url:
					locY = e.location['y']
					urlList.append(url)
			except Exception:
				continue  # stale element or href is None - skip it
		
		return locY, urlList
	
	def JudgeLoading(self, locY, urlList):
		"""Sleep 0.5 s, re-scan the page and report whether new content
		appeared. Returns (newY, mergedLinks) or (None, None)."""
		time.sleep(0.5)
		
		locYNew, urlListNew = self.GetLocY()
		
		if locYNew <= locY:
			return None, None  # nothing new loaded (yet)
		
		# merge with previously collected links and drop duplicates
		return locYNew, list(set(urlListNew + urlList))
	
	def GetWholePage(self):
		"""Keep scrolling to the page bottom until no new posts load for
		more than 20 consecutive checks, then return all post links."""
		locY, urlList = self.GetLocY()
		loadFailCount = 0
		pageDownJS = 'document.documentElement.scrollTop=100000000'  # jump to page bottom
		
		while True:
			fxDriver.execute_script(pageDownJS)
			
			while True:
				locYNew, urlListNew = self.JudgeLoading(locY, urlList)
				
				if locYNew is None:
					loadFailCount += 1
					if loadFailCount > 20:
						return urlList
				else:
					loadFailCount = 0
					locY = locYNew
					urlList = urlListNew
					break
	
	def Main(self, profileUrl):
		"""Open the profile page and return the list of all post links
		(None is implicitly returned when crawling fails)."""
		try:
			fxDriver.get(profileUrl)
			return self.GetWholePage()
		except Exception as e:
			print(e)

解释:可以通过调用PROFILE().Main(profileUrl)获得账户页中所有帖子的链接。

4. 爬取并下载帖子页中的信息、图片和视频

这一步的基本结构如下:

def Main(self, url):
	"""Open the post page, extract its metadata, then write the info
	file and download the media."""
	try:
		fxDriver.get(url)
		info = POST().GetInfo(fxDriver.page_source)
		POST().DownloadInfo(info)
		POST().DownloadFile(info)
	except Exception as e:
		print(e)

解释:浏览器先访问帖子页;然后通过fxDriver.page_source获取帖子页的HTML;POST().GetInfo(html)可以通过分析HTML获得用户上传帖子的时间(当前时区)、用户名称(Username)、用户全称(Full name)、帖子文字、点赞数、评论数、图片描述(当帖子中有图片时)、图片链接(当帖子中有图片时)、视频观看数(当帖子中有视频时)、视频链接(当帖子中有视频时)等信息;POST().DownloadInfo(info)把信息写入文本文档;POST().DownloadFile(info)根据获取的信息下载帖子页中的图片和视频。


POST().GetInfo(html)如下:

def GetInfo(self, html):
	"""Parse the post page HTML and extract the post metadata.

	Returns a tuple whose last element tags the media type: 'ps'
	(multi-picture post), 'v' (video) or 'p' (single picture).

	Fix: jsonData is initialised and checked, so a page without the
	expected inline 'graphql' script raises a clear ValueError instead
	of an UnboundLocalError."""
	soup = BeautifulSoup(html, 'html.parser')
	jsonData = None
	for s in soup.find_all('script', {'type':'text/javascript'}):
		# the post data lives in an inline script containing 'graphql'
		if s.string is not None and 'graphql' in s.string:
			jsonData = json.loads(s.string[s.string.find('{'): s.string.rfind('}') + 1])
			break
	if jsonData is None:
		raise ValueError('no graphql data found in the post page HTML')
	
	media = jsonData['graphql']['shortcode_media']
	uploadTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(media['taken_at_timestamp']))
	username = media['owner']['username']
	fullName = media['owner']['full_name']
	likes = media['edge_media_preview_like']['count']
	comments = media['edge_media_preview_comment']['count']
	try:
		text = media['edge_media_to_caption']['edges'][0]['node']['text']
	except (KeyError, IndexError):
		text = 'None'  # the post has no caption
	
	try:
		# multi-picture post ("sidecar"): map display url -> description
		displayDict = {}
		for obj in media['edge_sidecar_to_children']['edges']:
			displayDict[obj['node']['display_url']] = obj['node']['accessibility_caption']
		return uploadTime, username, fullName, likes, comments, text, displayDict, 'ps'
	except KeyError:
		try:
			# video post
			return uploadTime, username, fullName, likes, comments, text, media['video_url'], media['video_view_count'], 'v'
		except KeyError:
			# single-picture post
			return uploadTime, username, fullName, likes, comments, text, media['display_url'], media['accessibility_caption'], 'p'

解释:帖子中我们需要的所有信息都在 jsonData 中。我们通过判断'graphql'是否在'type'属性为'text/javascript'的script标签中来获取 jsonData。


POST().DownloadInfo(info)如下:

def DownloadInfo(self, info):
	"""Write all extracted post information into info.txt inside the
	per-post output folder (outputPath\\username\\YYYYMMDDHHMMSS)."""
	now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(time.time())))
	uploadTime, username, fullName, likes, comments, text = info[:6]
	stamp = uploadTime.replace('-', '').replace(':', '').replace(' ', '')
	folder = '{}\\{}\\{}'.format(outputPath, username, stamp)
	
	try:
		os.makedirs(folder)
	except Exception as e:
		print(e)  # folder probably exists already; info.txt is overwritten below
	
	lines = [
		'Now: {}'.format(now),
		'Upload time: {}'.format(uploadTime),
		'Username: {}'.format(username),
		'Full name: {}'.format(fullName),
		'Text: {}'.format(text),
		'Likes: {}'.format(likes),
		'Comments: {}'.format(comments),
	]
	
	if info[-1] == 'ps':
		# multi-picture post: one description/url pair per picture
		for picIdx, (displayUrl, picDescription) in enumerate(info[6].items(), 1):
			lines.append('Picture {} description: {}'.format(picIdx, picDescription))
			lines.append('Picture {} url: {}'.format(picIdx, displayUrl))
	elif info[-1] == 'v':
		lines.append('Video view count: {}'.format(info[7]))
		lines.append('Video url: {}'.format(info[6]))
	elif info[-1] == 'p':
		lines.append('Picture description: {}'.format(info[7]))
		lines.append('Picture url: {}'.format(info[6]))
	
	with open('{}\\info.txt'.format(folder), 'w', encoding='utf-8') as f:
		f.write('\n'.join(lines))

解释:outputPath 是全局变量,表示输出文件夹的绝对路径。


POST().DownloadFile(info)如下:

def DownloadFile(self, info):
	"""Download the post's picture(s) or video into the per-post output
	folder using wget through the configured HTTP(S) proxies.

	Fix: the --output-document path is quoted, so output folders that
	contain spaces no longer break the command line."""
	uploadTime = info[0]
	username = info[1]
	stamp = uploadTime.replace('-', '').replace(':', '').replace(' ', '')
	folder = '{}\\{}\\{}'.format(outputPath, username, stamp)
	# shared wget command template: {wget} {out-file} {http-proxy} {https-proxy} {url}
	cmd = '{} --output-document="{}\\{}" --no-check-certificate --execute http_proxy={} --execute https_proxy={} --execute robots=off --continue "{}"'
	
	if info[-1] == 'ps':
		# multi-picture post: 1.png, 2.png, ...
		for i, displayUrl in enumerate(info[6], 1):
			os.system(cmd.format(wgetPath, folder, '{}.png'.format(i), httpProxy, httpsProxy, displayUrl))
	elif info[-1] == 'v':
		os.system(cmd.format(wgetPath, folder, '1.mp4', httpProxy, httpsProxy, info[6]))
	elif info[-1] == 'p':
		os.system(cmd.format(wgetPath, folder, '1.png', httpProxy, httpsProxy, info[6]))

解释:可以参考这篇博文。wgetPath 是全局变量,表示wget.exe的绝对路径;httpProxy 是全局变量,表示HTTP代理;httpsProxy 是全局变量,表示HTTPS代理。


把上述模块如下整合到类中:

class POST(object):
	"""Downloads the information, picture(s) and video of a single
	Instagram post via the global Selenium driver ``fxDriver`` and the
	global ``outputPath``/``wgetPath``/proxy settings."""
	
	def GetInfo(self, html):
		"""Parse the post page HTML and extract the post metadata.

		Returns a tuple whose last element tags the media type: 'ps'
		(multi-picture post), 'v' (video) or 'p' (single picture).

		Fix: jsonData is initialised and checked, so a page without the
		expected inline 'graphql' script raises a clear ValueError
		instead of an UnboundLocalError."""
		soup = BeautifulSoup(html, 'html.parser')
		jsonData = None
		for s in soup.find_all('script', {'type':'text/javascript'}):
			# the post data lives in an inline script containing 'graphql'
			if s.string is not None and 'graphql' in s.string:
				jsonData = json.loads(s.string[s.string.find('{'): s.string.rfind('}') + 1])
				break
		if jsonData is None:
			raise ValueError('no graphql data found in the post page HTML')
		
		media = jsonData['graphql']['shortcode_media']
		uploadTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(media['taken_at_timestamp']))
		username = media['owner']['username']
		fullName = media['owner']['full_name']
		likes = media['edge_media_preview_like']['count']
		comments = media['edge_media_preview_comment']['count']
		try:
			text = media['edge_media_to_caption']['edges'][0]['node']['text']
		except (KeyError, IndexError):
			text = 'None'  # the post has no caption
		
		try:
			# multi-picture post ("sidecar"): map display url -> description
			displayDict = {}
			for obj in media['edge_sidecar_to_children']['edges']:
				displayDict[obj['node']['display_url']] = obj['node']['accessibility_caption']
			return uploadTime, username, fullName, likes, comments, text, displayDict, 'ps'
		except KeyError:
			try:
				# video post
				return uploadTime, username, fullName, likes, comments, text, media['video_url'], media['video_view_count'], 'v'
			except KeyError:
				# single-picture post
				return uploadTime, username, fullName, likes, comments, text, media['display_url'], media['accessibility_caption'], 'p'
	
	def _Folder(self, uploadTime, username):
		"""Build the per-post output folder path:
		outputPath\\username\\YYYYMMDDHHMMSS."""
		stamp = uploadTime.replace('-', '').replace(':', '').replace(' ', '')
		return '{}\\{}\\{}'.format(outputPath, username, stamp)
	
	def _Wget(self, folder, fileName, url):
		"""Download one file with wget through the configured proxies.
		The output path is quoted so folders with spaces do not break
		the command line."""
		os.system('{} --output-document="{}\\{}" --no-check-certificate --execute http_proxy={} --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, fileName, httpProxy, httpsProxy, url))
	
	def DownloadInfo(self, info):
		"""Write all extracted post information into info.txt inside the
		per-post output folder."""
		now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(time.time())))
		uploadTime, username, fullName, likes, comments, text = info[:6]
		folder = self._Folder(uploadTime, username)
		
		try:
			os.makedirs(folder)
		except Exception as e:
			print(e)  # folder probably exists already; info.txt is overwritten below
		
		with open('{}\\info.txt'.format(folder), 'w', encoding='utf-8') as f:
			f.write('Now: {}'.format(now))
			f.write('\nUpload time: {}'.format(uploadTime))
			f.write('\nUsername: {}'.format(username))
			f.write('\nFull name: {}'.format(fullName))
			f.write('\nText: {}'.format(text))
			f.write('\nLikes: {}'.format(likes))
			f.write('\nComments: {}'.format(comments))
			
			if info[-1] == 'ps':
				for picIdx, (displayUrl, picDescription) in enumerate(info[6].items(), 1):
					f.write('\nPicture {} description: {}'.format(picIdx, picDescription))
					f.write('\nPicture {} url: {}'.format(picIdx, displayUrl))
			elif info[-1] == 'v':
				f.write('\nVideo view count: {}'.format(info[7]))
				f.write('\nVideo url: {}'.format(info[6]))
			elif info[-1] == 'p':
				f.write('\nPicture description: {}'.format(info[7]))
				f.write('\nPicture url: {}'.format(info[6]))
	
	def DownloadFile(self, info):
		"""Download the post's picture(s) or video into the post folder."""
		folder = self._Folder(info[0], info[1])
		
		if info[-1] == 'ps':
			# multi-picture post: 1.png, 2.png, ...
			for i, displayUrl in enumerate(info[6], 1):
				self._Wget(folder, '{}.png'.format(i), displayUrl)
		elif info[-1] == 'v':
			self._Wget(folder, '1.mp4', info[6])
		elif info[-1] == 'p':
			self._Wget(folder, '1.png', info[6])
	
	def Main(self, url):
		"""Open the post page, extract its metadata, then write the info
		file and download the media."""
		try:
			fxDriver.get(url)
			info = self.GetInfo(fxDriver.page_source)
			self.DownloadInfo(info)
			self.DownloadFile(info)
		except Exception as e:
			print(e)

解释:可以通过调用POST().Main(url)下载每一个帖子的信息、图片和视频。

5. 完整代码

from selenium import webdriver
from bs4 import BeautifulSoup
import json, time, os

# Local proxy ports - fill these in before running. They are given
# placeholder values here so the file parses; the original left the
# right-hand sides empty, which is a SyntaxError.
socksPort = 0
httpPort = 0
sslPort = 0

fxProfile = webdriver.firefox.firefox_profile.FirefoxProfile()
# Route all traffic through local proxies (SOCKS + HTTP + SSL on 127.0.0.1).
fxProfile.set_preference('network.proxy.type', 1)  # 1 = manual proxy configuration
fxProfile.set_preference('network.proxy.socks', '127.0.0.1')
fxProfile.set_preference('network.proxy.socks_port', socksPort)
fxProfile.set_preference('network.proxy.http', '127.0.0.1')
fxProfile.set_preference('network.proxy.http_port', httpPort)
fxProfile.set_preference('network.proxy.ssl', '127.0.0.1')
fxProfile.set_preference('network.proxy.ssl_port', sslPort)
fxProfile.set_preference('network.proxy.socks_remote_dns', True)  # resolve DNS through the proxy
fxProfile.set_preference('network.trr.mode', 2)  # DNS-over-HTTPS with fallback to native DNS
fxProfile.set_preference('permissions.default.image', 2)  # block image loading to speed up crawling
fxProfile.set_preference('intl.accept_languages', 'zh-CN, zh, zh-TW, zh-HK, en-US, en')
# Absolute paths to firefox.exe and geckodriver.exe - fill these in.
fxBinaryPath = ''
geckodriverPath = ''
fxDriver = webdriver.firefox.webdriver.WebDriver(firefox_profile=fxProfile, firefox_binary=fxBinaryPath, executable_path=geckodriverPath)

# Log in to Instagram with the credentials below.
account = ''   # your Instagram username
password = ''  # your Instagram password
fxDriver.get('https://www.instagram.com/accounts/login/')
# Wait (up to 100 s) for the username field to appear, i.e. the login form loaded.
webdriver.support.ui.WebDriverWait(fxDriver, 100).until(lambda x: x.find_element_by_name('username'))
fxDriver.find_element_by_name('username').send_keys(account)
fxDriver.find_element_by_name('password').send_keys(password)
fxDriver.find_element_by_xpath('//button[@type="submit"]').click()

outputPath = ''  # absolute path of the output folder
wgetPath = ''    # absolute path of wget.exe
# Proxy URLs handed to wget, built from the ports configured above.
httpProxy = 'http://127.0.0.1:{}/'.format(str(httpPort))
httpsProxy = 'https://127.0.0.1:{}/'.format(str(sslPort))

class POST(object):
	"""Downloads the information, picture(s) and video of a single
	Instagram post via the global Selenium driver ``fxDriver`` and the
	global ``outputPath``/``wgetPath``/proxy settings."""
	
	def GetInfo(self, html):
		"""Parse the post page HTML and extract the post metadata.

		Returns a tuple whose last element tags the media type: 'ps'
		(multi-picture post), 'v' (video) or 'p' (single picture).

		Fix: jsonData is initialised and checked, so a page without the
		expected inline 'graphql' script raises a clear ValueError
		instead of an UnboundLocalError."""
		soup = BeautifulSoup(html, 'html.parser')
		jsonData = None
		for s in soup.find_all('script', {'type':'text/javascript'}):
			# the post data lives in an inline script containing 'graphql'
			if s.string is not None and 'graphql' in s.string:
				jsonData = json.loads(s.string[s.string.find('{'): s.string.rfind('}') + 1])
				break
		if jsonData is None:
			raise ValueError('no graphql data found in the post page HTML')
		
		media = jsonData['graphql']['shortcode_media']
		uploadTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(media['taken_at_timestamp']))
		username = media['owner']['username']
		fullName = media['owner']['full_name']
		likes = media['edge_media_preview_like']['count']
		comments = media['edge_media_preview_comment']['count']
		try:
			text = media['edge_media_to_caption']['edges'][0]['node']['text']
		except (KeyError, IndexError):
			text = 'None'  # the post has no caption
		
		try:
			# multi-picture post ("sidecar"): map display url -> description
			displayDict = {}
			for obj in media['edge_sidecar_to_children']['edges']:
				displayDict[obj['node']['display_url']] = obj['node']['accessibility_caption']
			return uploadTime, username, fullName, likes, comments, text, displayDict, 'ps'
		except KeyError:
			try:
				# video post
				return uploadTime, username, fullName, likes, comments, text, media['video_url'], media['video_view_count'], 'v'
			except KeyError:
				# single-picture post
				return uploadTime, username, fullName, likes, comments, text, media['display_url'], media['accessibility_caption'], 'p'
	
	def _Folder(self, uploadTime, username):
		"""Build the per-post output folder path:
		outputPath\\username\\YYYYMMDDHHMMSS."""
		stamp = uploadTime.replace('-', '').replace(':', '').replace(' ', '')
		return '{}\\{}\\{}'.format(outputPath, username, stamp)
	
	def _Wget(self, folder, fileName, url):
		"""Download one file with wget through the configured proxies.
		The output path is quoted so folders with spaces do not break
		the command line."""
		os.system('{} --output-document="{}\\{}" --no-check-certificate --execute http_proxy={} --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, fileName, httpProxy, httpsProxy, url))
	
	def DownloadInfo(self, info):
		"""Write all extracted post information into info.txt inside the
		per-post output folder."""
		now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(time.time())))
		uploadTime, username, fullName, likes, comments, text = info[:6]
		folder = self._Folder(uploadTime, username)
		
		try:
			os.makedirs(folder)
		except Exception as e:
			print(e)  # folder probably exists already; info.txt is overwritten below
		
		with open('{}\\info.txt'.format(folder), 'w', encoding='utf-8') as f:
			f.write('Now: {}'.format(now))
			f.write('\nUpload time: {}'.format(uploadTime))
			f.write('\nUsername: {}'.format(username))
			f.write('\nFull name: {}'.format(fullName))
			f.write('\nText: {}'.format(text))
			f.write('\nLikes: {}'.format(likes))
			f.write('\nComments: {}'.format(comments))
			
			if info[-1] == 'ps':
				for picIdx, (displayUrl, picDescription) in enumerate(info[6].items(), 1):
					f.write('\nPicture {} description: {}'.format(picIdx, picDescription))
					f.write('\nPicture {} url: {}'.format(picIdx, displayUrl))
			elif info[-1] == 'v':
				f.write('\nVideo view count: {}'.format(info[7]))
				f.write('\nVideo url: {}'.format(info[6]))
			elif info[-1] == 'p':
				f.write('\nPicture description: {}'.format(info[7]))
				f.write('\nPicture url: {}'.format(info[6]))
	
	def DownloadFile(self, info):
		"""Download the post's picture(s) or video into the post folder."""
		folder = self._Folder(info[0], info[1])
		
		if info[-1] == 'ps':
			# multi-picture post: 1.png, 2.png, ...
			for i, displayUrl in enumerate(info[6], 1):
				self._Wget(folder, '{}.png'.format(i), displayUrl)
		elif info[-1] == 'v':
			self._Wget(folder, '1.mp4', info[6])
		elif info[-1] == 'p':
			self._Wget(folder, '1.png', info[6])
	
	def Main(self, url):
		"""Open the post page, extract its metadata, then write the info
		file and download the media."""
		try:
			fxDriver.get(url)
			info = self.GetInfo(fxDriver.page_source)
			self.DownloadInfo(info)
			self.DownloadFile(info)
		except Exception as e:
			print(e)

class PROFILE(object):
	"""Collects the links of every post on an Instagram profile page by
	repeatedly scrolling the (already opened) global Selenium driver
	``fxDriver`` to the bottom."""
	
	def GetLocY(self):
		"""Scan every <a> tag, collect post links (URLs containing
		'/p/') and return the Y coordinate of the last post link plus
		the list of links.

		Fix: locY defaults to 0 so an empty profile page no longer
		raises UnboundLocalError."""
		locY = 0
		urlList = []
		
		for e in fxDriver.find_elements_by_tag_name('a'):
			try:
				url = e.get_attribute('href')
				if '/p/' in url:
					locY = e.location['y']
					urlList.append(url)
			except Exception:
				continue  # stale element or href is None - skip it
		
		return locY, urlList
	
	def JudgeLoading(self, locY, urlList):
		"""Sleep 0.5 s, re-scan the page and report whether new content
		appeared. Returns (newY, mergedLinks) or (None, None)."""
		time.sleep(0.5)
		
		locYNew, urlListNew = self.GetLocY()
		
		if locYNew <= locY:
			return None, None  # nothing new loaded (yet)
		
		# merge with previously collected links and drop duplicates
		return locYNew, list(set(urlListNew + urlList))
	
	def GetWholePage(self):
		"""Keep scrolling to the page bottom until no new posts load for
		more than 20 consecutive checks, then return all post links."""
		locY, urlList = self.GetLocY()
		loadFailCount = 0
		pageDownJS = 'document.documentElement.scrollTop=100000000'  # jump to page bottom
		
		while True:
			fxDriver.execute_script(pageDownJS)
			
			while True:
				locYNew, urlListNew = self.JudgeLoading(locY, urlList)
				
				if locYNew is None:
					loadFailCount += 1
					if loadFailCount > 20:
						return urlList
				else:
					loadFailCount = 0
					locY = locYNew
					urlList = urlListNew
					break
	
	def Main(self, profileUrl):
		"""Open the profile page and return the list of all post links
		(None is implicitly returned when crawling fails)."""
		try:
			fxDriver.get(profileUrl)
			return self.GetWholePage()
		except Exception as e:
			print(e)

def Main():
	"""Prompt for a profile link, crawl it and download every post,
	then prompt again.

	Fixes: the original called Main() recursively at the end of each
	run, which grows the call stack without bound; it also crashed with
	a TypeError when PROFILE().Main returned None after a failure. A
	while-loop and a None-guard handle both."""
	while True:
		profileUrl = input('Please input the instagram profile link: ')
		
		urlList = PROFILE().Main(profileUrl)
		
		# PROFILE().Main returns None when crawling failed - skip downloading
		for url in urlList or []:
			POST().Main(url)

if __name__ == '__main__':
	Main()
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章