目录 |
---|
0. 项目介绍 |
1. 爬取账户页中所有帖子的链接 |
2. 爬取并下载帖子页中的信息、图片和视频 |
3. 完整代码 |
0. 项目介绍
本项目的目的是输入Instagram账号或账户页链接或帖子页链接,输出该账户帖子的:① 包含当前时间、用户上传帖子的时间(当前时区)、用户名称(Username)、用户全称(Full name)、帖子文字、点赞数、评论数、图片描述(当帖子中有图片时)、图片链接(当帖子中有图片时)、视频观看数(当帖子中有视频时)、视频链接(当帖子中有视频时)的文本文档;② 图片(当帖子中有图片时)、视频(当帖子中有视频时)。
本项目需要先导入如下库:
from selenium import webdriver
from multiprocessing import Pool
import json, time, os
本项目的全局变量如下:
# --- Global configuration ---------------------------------------------------
sslPort = 1080          # TODO: local port of an HTTPS proxy that can reach Instagram (was left empty: syntax error)
fxBinaryPath = ''       # absolute path of Firefox's firefox.exe
geckodriverPath = ''    # absolute path of geckodriver.exe
# JS snippet that scrolls the page to the very bottom to trigger lazy loading.
pageDownJS = 'document.documentElement.scrollTop = 100000000'
outputPath = ''         # root folder that downloaded posts are written into
ariaPath = ''           # absolute path of aria2c.exe (kept for compatibility; download code uses wget)
wgetPath = ''           # absolute path of wget.exe -- fix: DownloadFile references this but it was never defined
httpsProxy = 'https://127.0.0.1:{}/'.format(str(sslPort))
解释:
sslPort
:可用于访问Instagram的HTTPS代理的本地端口。fxBinaryPath
:Firefox浏览器的firefox.exe
的绝对路径。geckodriverPath
:geckodriver.exe
的绝对路径。pageDownJS
:用于下拉页面的JavaScript代码。outputPath
:输出路径。wgetPath
:wget.exe
的绝对路径。httpsProxy
:用于GNU Wget for Windows的HTTPS代理。
本项目的基本结构如下:
def Driver():
    # Builds and returns a configured Firefox webdriver instance.
    ...

class DOWNLOAD:
    # Multi-process download helper.
    ...

class POST:
    # Scrapes and downloads the info, pictures and videos of one post page.
    ...

class PROFILE:
    # Collects the links of every post on a profile page.
    ...

def Main():
    # Entry point: accepts a username, profile link or post link and drives
    # the classes above.
    ...

if __name__ == '__main__':
    Main()
本项目的运行流程可见Main函数:
def Main():
    """Entry point: read a username / profile link / post link and dispatch it."""
    fxDriver = Driver()
    inputUrl = input('Please input instagram link or username: ')
    if '/p/' in inputUrl:
        # '/p/' marks a single post page.
        POST(fxDriver, inputUrl).Main()
    else:
        # A bare username is expanded into a profile URL first.
        if not 'www.instagram.com' in inputUrl:
            inputUrl = 'https://www.instagram.com/{}/'.format(inputUrl)
        urlList = PROFILE(fxDriver, inputUrl).Main()
        if urlList:
            l = len(urlList)
            i = 0
            for url in urlList:
                POST(fxDriver, url).Main()
                i += 1
                # Progress indicator over the collected post links.
                print('\n\n{:.2f} % completed.\n\n'.format(i / l * 100))
    fxDriver.quit()

Main()
1. 爬取账户页中所有帖子的链接
这一步的基本结构如下:
def Main(self):
    """Open the profile page and return the list of post links (None on failure)."""
    try:
        fxDriver.get(self.profileUrl)
        urlList = self.GetWholePage()
        return urlList
    except Exception as e:
        # Best-effort: log the error and let the caller see a None result.
        print(e)
解释:① 浏览器访问账户页。② self.GetWholePage()
负责爬取账户页中所有帖子的链接,生成链接列表urlList
。
self.GetWholePage()
如下:
def GetWholePage(self):
    """Keep scrolling the profile page until updateCount post links are collected.

    Returns the newest updateCount links ordered by their on-page position.
    """
    updateCount = self.Update()
    fxDriver.execute_script(pageDownJS)
    try:
        # Some profiles hide older posts behind a "show more posts" button.
        fxDriver.find_element_by_xpath('//div[contains(text(), "更多帖子")]').click()
    except Exception as e:
        print(e)
    locY, urlDict = self.GetLocY()
    while 1:
        fxDriver.execute_script(pageDownJS)
        while 1:
            locYNew, urlDictNew = self.JudgeLoading(locY, urlDict)
            # Sort by the x/1000 + y score so links follow visual page order.
            urlList = [t[0] for t in sorted(urlDictNew.items(), key=lambda x: x[1])]
            if len(urlList) >= updateCount:
                return urlList[:updateCount]
            if locYNew is None:  # fix: identity check instead of '== None'
                # Page has not grown yet -- keep polling until the scroll lands.
                # NOTE(review): loops forever if the feed never grows (private
                # account / end of feed) -- consider adding a retry cap.
                continue
            locY = locYNew
            urlDict = urlDictNew
            break
解释:
self.Update()
用于计算需要更新的贴子数。fxDriver.execute_script(pageDownJS)
可以通过执行JS代码pageDownJS
把页面拉到最下面。self.GetLocY()
可以获得账户页HTML中每个帖子链接所在tag的Y座标locY
和当前加载的所有帖子的链接字典urlDict
。self.JudgeLoading(locY, urlDict)
可以对比输入的Y座标和0.5秒之后的Y座标来判断pageDownJS
有没有执行完毕。
self.Update()
如下:
def Update(self):
    """Return how many posts still need downloading (profile total minus local folders)."""
    jsonData = None  # fix: previously unbound -> NameError when no script matched
    for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
        try:
            jsonText = e.get_attribute('textContent')
            if 'viewerId' in jsonText:
                # The profile JSON sits between the outermost braces of the script text.
                jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['ProfilePage'][0]['graphql']['user']
                break
        except Exception:
            continue
    if jsonData is None:
        raise RuntimeError('profile JSON not found on page')
    postCount = jsonData['edge_owner_to_timeline_media']['count']
    username = jsonData['username']
    folder = '{}\\{}'.format(outputPath, username)
    if os.path.exists(folder):
        # One sub-folder per already-downloaded post.
        downloadCount = len([x for x in os.listdir(folder) if os.path.isdir('{}\\{}'.format(folder, x))])
    else:
        downloadCount = 0
    updateCount = postCount - downloadCount
    return updateCount
解释:① 解析网页中的贴子数。② 统计已经下载了多少帖子。③ 计算需要更新的贴子数。
self.GetLocY()
如下:
def GetLocY(self):
    """Return (y of the last post link, {post url: x/1000 + y position score})."""
    locY = 0  # fix: previously unbound (NameError) when no post links are loaded yet
    urlDict = {}
    for e in fxDriver.find_elements_by_xpath('//a[contains(@href, "/p/")]'):
        locY = e.location['y']
        locX = e.location['x']
        url = e.get_attribute('href')
        # x/1000 + y collapses (row, column) into one sortable score: rows
        # dominate, the x term breaks ties within a row.
        urlDict[url] = locX / 1000 + locY
    return locY, urlDict
解释:通过循环判断'/p/'
有没有在a
标签的'href'
属性中来获得帖子链接及其所在tag的Y座标。
self.JudgeLoading(locY, urlDict)
如下:
def JudgeLoading(self, locY, urlDict):
    """Wait 0.5 s, re-measure, and report whether scrolling loaded new content.

    Returns (new y, merged url dict) when the page grew, or (None, fresh dict)
    when it did not (i.e. the scroll has not finished / nothing new loaded).
    """
    time.sleep(0.5)
    locYNew, urlDictNew = self.GetLocY()
    if locYNew > locY:
        # Page grew: keep the previously recorded positions for known links.
        urlDictNew.update(urlDict)
    else:
        locYNew = None
    return locYNew, urlDictNew
把上述模块如下整合到类中:
class PROFILE:
    """Collects the links of every post on an Instagram profile page."""

    def __init__(self, profileUrl):
        self.profileUrl = profileUrl  # full https://www.instagram.com/<user>/ URL

    def Update(self):
        """Return how many posts still need downloading (total minus local folders)."""
        jsonData = None  # fix: previously unbound -> NameError when no script matched
        for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
            try:
                jsonText = e.get_attribute('textContent')
                if 'viewerId' in jsonText:
                    # The profile JSON sits between the outermost braces of the script.
                    jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['ProfilePage'][0]['graphql']['user']
                    break
            except Exception:
                continue
        if jsonData is None:
            raise RuntimeError('profile JSON not found on page')
        postCount = jsonData['edge_owner_to_timeline_media']['count']
        username = jsonData['username']
        folder = '{}\\{}'.format(outputPath, username)
        if os.path.exists(folder):
            # One sub-folder per already-downloaded post.
            downloadCount = len([x for x in os.listdir(folder) if os.path.isdir('{}\\{}'.format(folder, x))])
        else:
            downloadCount = 0
        return postCount - downloadCount

    def GetLocY(self):
        """Return (y of the last post link, {post url: x/1000 + y position score})."""
        locY = 0  # fix: previously unbound when no post links were loaded yet
        urlDict = {}
        for e in fxDriver.find_elements_by_xpath('//a[contains(@href, "/p/")]'):
            locY = e.location['y']
            locX = e.location['x']
            # Rows dominate the score; the x term breaks ties within a row.
            urlDict[e.get_attribute('href')] = locX / 1000 + locY
        return locY, urlDict

    def JudgeLoading(self, locY, urlDict):
        """Wait 0.5 s and re-measure; (None, ...) means the page did not grow."""
        time.sleep(0.5)
        locYNew, urlDictNew = self.GetLocY()
        if locYNew > locY:
            # Page grew: keep previously recorded positions for known links.
            urlDictNew.update(urlDict)
        else:
            locYNew = None
        return locYNew, urlDictNew

    def GetWholePage(self):
        """Scroll until enough post links are collected; return the newest ones."""
        updateCount = self.Update()
        fxDriver.execute_script(pageDownJS)
        try:
            # Some profiles hide older posts behind a "show more posts" button.
            fxDriver.find_element_by_xpath('//div[contains(text(), "更多帖子")]').click()
        except Exception as e:
            print(e)
        locY, urlDict = self.GetLocY()
        while 1:
            fxDriver.execute_script(pageDownJS)
            while 1:
                locYNew, urlDictNew = self.JudgeLoading(locY, urlDict)
                urlList = [t[0] for t in sorted(urlDictNew.items(), key=lambda x: x[1])]
                if len(urlList) >= updateCount:
                    return urlList[:updateCount]
                if locYNew is None:  # fix: identity check instead of '== None'
                    # NOTE(review): loops forever if the feed never grows -- consider a retry cap.
                    continue
                locY = locYNew
                urlDict = urlDictNew
                break

    def Main(self):
        """Open the profile page and return the post-link list (None on failure)."""
        try:
            fxDriver.get(self.profileUrl)
            return self.GetWholePage()
        except Exception as e:
            print(e)
解释:可以通过执行PROFILE(inputUrl).Main()
获得账户页中所有帖子的链接。
2. 爬取并下载帖子页中的信息、图片和视频
这一步的基本结构如下:
def Main(self):
    """Open the post page, scrape it, write info.txt and download the media."""
    try:
        fxDriver.get(self.url)
        info = self.GetInfo()
        self.DownloadInfo(info)
        self.DownloadFile(info)
    except Exception as e:
        # Best-effort: log the error and move on to the next post.
        print(e)
解释:① 浏览器访问帖子页。② self.GetInfo()
可以获得用户上传帖子的时间(当前时区)、用户名称(Username)、用户全称(Full name)、帖子文字、点赞数、评论数、图片描述(当帖子中有图片时)、图片链接(当帖子中有图片时)、视频观看数(当帖子中有视频时)、视频链接(当帖子中有视频时)等信息。③ self.DownloadInfo(info)
把信息写入文本文档。④ self.DownloadFile(info)
根据获取的信息下载帖子页中的图片和视频。
self.GetInfo()
如下:
def GetInfo(self):
    """Parse the post-page JSON and return an info tuple.

    The tuple's last element tags the media layout: 'm' (multi-media, with a
    {url: view-count-or-description} dict), 'v' (single video) or 'p'
    (single picture).
    """
    jsonData = None  # fix: previously unbound -> NameError when no script matched
    for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
        try:
            jsonText = e.get_attribute('textContent')
            if '"viewerId":null' in jsonText:
                # The post JSON sits between the outermost braces of the script text.
                jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['PostPage'][0]['graphql']['shortcode_media']
                break
        except Exception:
            continue
    if jsonData is None:
        raise RuntimeError('post JSON not found on page')
    uploadTimeStamp = jsonData['taken_at_timestamp']
    uploadTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(uploadTimeStamp))
    username = jsonData['owner']['username']
    fullName = jsonData['owner']['full_name']
    likes = jsonData['edge_media_preview_like']['count']
    comments = jsonData['edge_media_preview_comment']['count']
    try:
        text = jsonData['edge_media_to_caption']['edges'][0]['node']['text']
    except (KeyError, IndexError):
        text = ''  # captionless post: the edges list is empty
    try:
        # Multi-media post: iterate the sidecar children.
        # NOTE(review): keying by URL silently drops duplicate media URLs --
        # a list of (url, info) tuples would be safer.
        mediaDict = {}
        for obj in jsonData['edge_sidecar_to_children']['edges']:
            try:
                mediaDict[obj['node']['video_url']] = obj['node']['video_view_count']
            except KeyError:
                # No video_url: this child is a picture.
                mediaDict[obj['node']['display_url']] = obj['node']['accessibility_caption']
        return uploadTime, username, fullName, likes, comments, text, mediaDict, 'm'
    except KeyError:
        try:
            return uploadTime, username, fullName, likes, comments, text, jsonData['video_url'], jsonData['video_view_count'], 'v'
        except KeyError:
            return uploadTime, username, fullName, likes, comments, text, jsonData['display_url'], jsonData['accessibility_caption'], 'p'
解释:① 无需登录即可获取包含所有信息的JSON数据。② 获取一些通用的信息,如发布时间、用户名、点赞数、评论数等。③ 获取图片或视频的链接和相关信息。帖子可以分为多图(视频)和单图(视频),而在多图(视频)帖子中需要判断是图片还是视频,在单图(视频)帖子中也要判断。
self.DownloadInfo(info)
如下:
def DownloadInfo(self, info):
    """Write the scraped info tuple to <outputPath>\\<username>\\<timestamp>\\info.txt."""
    now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(time.time())))
    uploadTime = info[0]
    username = info[1]
    fullName = info[2]
    likes = info[3]
    comments = info[4]
    text = info[5]
    # Folder name is the upload time with separators stripped, e.g. 20200101123000.
    folder = '{}\\{}\\{}'.format(outputPath, username, uploadTime.replace('-', '').replace(':', '').replace(' ', ''))
    os.makedirs(folder, exist_ok=True)  # fix: no noisy error print when the folder already exists
    with open('{}\\info.txt'.format(folder), 'w', encoding='utf-8') as f:
        f.write('Now: {}'.format(now))
        f.write('\nUpload time: {}'.format(uploadTime))
        f.write('\nUsername: {}'.format(username))
        f.write('\nFull name: {}'.format(fullName))
        f.write('\nText: {}'.format(text))
        f.write('\nLikes: {}'.format(likes))
        f.write('\nComments: {}'.format(comments))
        if info[-1] == 'm':
            mediaDict = info[6]
            i = 1
            # NOTE(review): an all-digit picture description would be logged as a
            # video view count here -- confirm against real data.
            for mediaUrl, mediaInfo in mediaDict.items():
                if str(mediaInfo).isdigit():
                    f.write('\n{}. Video view count: {}'.format(str(i), str(mediaInfo)))
                    f.write('\n{}. Video url: {}'.format(str(i), mediaUrl))
                else:
                    f.write('\n{}. Picture description: {}'.format(str(i), mediaInfo))
                    f.write('\n{}. Picture url: {}'.format(str(i), mediaUrl))
                i += 1
        elif info[-1] == 'v':
            vidUrl = info[6]
            vidViewCount = info[7]
            f.write('\nVideo view count: {}'.format(vidViewCount))
            f.write('\nVideo url: {}'.format(vidUrl))
        elif info[-1] == 'p':
            picUrl = info[6]
            picDescription = info[7]
            f.write('\nPicture description: {}'.format(picDescription))
            f.write('\nPicture url: {}'.format(picUrl))
self.DownloadFile(info)
如下:
def DownloadFile(self, info):
    """Download the post's pictures/videos into the per-post folder via wget."""
    uploadTime = info[0]
    username = info[1]
    folder = '{}\\{}\\{}'.format(outputPath, username, uploadTime.replace('-', '').replace(':', '').replace(' ', ''))
    os.makedirs(folder, exist_ok=True)  # fix: don't depend on DownloadInfo() having run first
    # NOTE(review): the scraped URL is interpolated into a shell command; prefer
    # subprocess.run([...], shell=False) to rule out shell injection.
    if info[-1] == 'm':
        mediaDict = info[6]
        i = 1
        for mediaUrl, mediaInfo in mediaDict.items():
            if str(mediaInfo).isdigit():
                os.system('{} --output-document={}\\{}.mp4 --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, str(i), httpsProxy, mediaUrl))
            else:
                os.system('{} --output-document={}\\{}.jpg --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, str(i), httpsProxy, mediaUrl))
            i += 1
    elif info[-1] == 'v':
        vidUrl = info[6]
        os.system('{} --output-document={}\\1.mp4 --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, httpsProxy, vidUrl))
    elif info[-1] == 'p':
        picUrl = info[6]
        os.system('{} --output-document={}\\1.jpg --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, httpsProxy, picUrl))
解释:通过调用 GNU Wget(经由本地 HTTPS 代理 httpsProxy)下载帖子中的图片和视频。
把上述模块如下整合到类中:
class POST:
    """Scrapes one Instagram post page and downloads its info, pictures and videos."""

    def __init__(self, url):
        self.url = url  # full https://www.instagram.com/p/<shortcode>/ URL

    def GetInfo(self):
        """Parse the post-page JSON and return an info tuple.

        The tuple's last element tags the media layout: 'm' (multi-media, with
        a list of (url, view-count-or-description) tuples), 'v' (single video)
        or 'p' (single picture).
        """
        jsonData = None  # fix: previously unbound -> NameError when no script matched
        for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
            try:
                jsonText = e.get_attribute('textContent')
                if 'viewerId' in jsonText:
                    # The post JSON sits between the outermost braces of the script text.
                    jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['PostPage'][0]['graphql']['shortcode_media']
                    break
            except Exception:
                continue
        if jsonData is None:
            raise RuntimeError('post JSON not found on page')
        uploadTimeStamp = jsonData['taken_at_timestamp']
        uploadTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(uploadTimeStamp))
        username = jsonData['owner']['username']
        fullName = jsonData['owner']['full_name']
        likes = jsonData['edge_media_preview_like']['count']
        comments = jsonData['edge_media_preview_comment']['count']
        try:
            text = jsonData['edge_media_to_caption']['edges'][0]['node']['text']
        except (KeyError, IndexError):
            text = ''  # captionless post: the edges list is empty
        try:
            # Multi-media post: iterate the sidecar children.
            mediaList = []
            for obj in jsonData['edge_sidecar_to_children']['edges']:
                try:
                    mediaList.append((obj['node']['video_url'], obj['node']['video_view_count']))
                except KeyError:
                    # No video_url: this child is a picture.
                    mediaList.append((obj['node']['display_url'], obj['node']['accessibility_caption']))
            return uploadTime, username, fullName, likes, comments, text, mediaList, 'm'
        except KeyError:
            try:
                return uploadTime, username, fullName, likes, comments, text, jsonData['video_url'], jsonData['video_view_count'], 'v'
            except KeyError:
                return uploadTime, username, fullName, likes, comments, text, jsonData['display_url'], jsonData['accessibility_caption'], 'p'

    def DownloadInfo(self, info):
        """Write the info tuple into <outputPath>\\<username>\\<timestamp>\\info.txt."""
        now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(time.time())))
        uploadTime = info[0]
        username = info[1]
        fullName = info[2]
        likes = info[3]
        comments = info[4]
        text = info[5]
        # Folder name = upload time with every non-digit stripped, e.g. 20200101123000.
        folder = '{}\\{}\\{}'.format(outputPath, username, ''.join([x for x in uploadTime if x.isdigit()]))
        os.makedirs(folder, exist_ok=True)  # fix: no noisy error print when the folder exists
        with open('{}\\info.txt'.format(folder), 'w', encoding='utf-8') as f:
            f.write('Now: {}'.format(now))
            f.write('\nUpload time: {}'.format(uploadTime))
            f.write('\nUsername: {}'.format(username))
            f.write('\nFull name: {}'.format(fullName))
            f.write('\nText: {}'.format(text))
            f.write('\nLikes: {}'.format(likes))
            f.write('\nComments: {}'.format(comments))
            if info[-1] == 'm':
                mediaList = info[6]
                # NOTE(review): an all-digit picture description would be logged
                # as a video view count here -- confirm against real data.
                for i, (mediaUrl, mediaInfo) in enumerate(mediaList, 1):
                    if str(mediaInfo).isdigit():
                        f.write('\n{}. Video view count: {} Video url: {}'.format(str(i), str(mediaInfo), mediaUrl))
                    else:
                        f.write('\n{}. Picture description: {} Picture url: {}'.format(str(i), mediaInfo, mediaUrl))
            elif info[-1] == 'v':
                f.write('\nVideo view count: {}'.format(info[7]))
                f.write('\nVideo url: {}'.format(info[6]))
            elif info[-1] == 'p':
                f.write('\nPicture description: {}'.format(info[7]))
                f.write('\nPicture url: {}'.format(info[6]))

    def DownloadFile(self, info):
        """Download every picture/video of the post into its folder via wget."""
        uploadTime = info[0]
        username = info[1]
        folder = '{}\\{}\\{}'.format(outputPath, username, ''.join([x for x in uploadTime if x.isdigit()]))
        os.makedirs(folder, exist_ok=True)  # fix: no noisy error print when the folder exists
        # NOTE(review): scraped URLs are interpolated into a shell command;
        # prefer subprocess.run([...], shell=False) to rule out shell injection.
        if info[-1] == 'm':
            for i, (mediaUrl, mediaInfo) in enumerate(info[6], 1):
                # Digit-only info means a video view count, otherwise a picture description.
                ext = 'mp4' if str(mediaInfo).isdigit() else 'jpg'
                os.system('{} --output-document={}\\{}.{} --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, str(i), ext, httpsProxy, mediaUrl))
        elif info[-1] == 'v':
            os.system('{} --output-document={}\\1.mp4 --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, httpsProxy, info[6]))
        elif info[-1] == 'p':
            os.system('{} --output-document={}\\1.jpg --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, httpsProxy, info[6]))

    def Main(self):
        """Open the post page, scrape it, write info.txt and download the media."""
        try:
            fxDriver.get(self.url)
            info = self.GetInfo()
            self.DownloadInfo(info)
            self.DownloadFile(info)
        except Exception as e:
            print(e)
解释:可以通过执行POST(url).Main()
下载每一个帖子的信息、图片和视频。
3. 完整代码
from selenium import webdriver
import json, time, os
# --- Configuration -----------------------------------------------------------
sslPort = 1080          # TODO: local port of an HTTPS proxy that can reach Instagram (was left empty: syntax error)
fxBinaryPath = ''       # absolute path of firefox.exe
geckodriverPath = ''    # absolute path of geckodriver.exe
pageDownJS = 'document.documentElement.scrollTop = 100000000'  # scroll to page bottom
outputPath = ''         # root folder for downloaded posts
wgetPath = ''           # absolute path of wget.exe
httpsProxy = 'https://127.0.0.1:{}/'.format(str(sslPort))
# --- Firefox profile: route SSL traffic through the local proxy --------------
fxProfile = webdriver.firefox.firefox_profile.FirefoxProfile()
fxProfile.set_preference('network.proxy.type', 1)                 # manual proxy settings
fxProfile.set_preference('network.proxy.ssl', '127.0.0.1')
fxProfile.set_preference('network.proxy.ssl_port', sslPort)
fxProfile.set_preference('network.proxy.socks_remote_dns', True)  # resolve DNS through the proxy
fxProfile.set_preference('network.trr.mode', 2)
fxProfile.set_preference('permissions.default.image', 2)          # skip image loading for speed
fxProfile.set_preference('intl.accept_languages', 'zh-CN, zh, zh-TW, zh-HK, en-US, en')
fxDriver = webdriver.firefox.webdriver.WebDriver(firefox_profile=fxProfile, firefox_binary=fxBinaryPath, executable_path=geckodriverPath)
class POST:
    """Scrapes one Instagram post page and downloads its info, pictures and videos."""

    def __init__(self, url):
        self.url = url  # full https://www.instagram.com/p/<shortcode>/ URL

    def GetInfo(self):
        """Parse the post-page JSON and return an info tuple.

        The tuple's last element tags the media layout: 'm' (multi-media, with
        a list of (url, view-count-or-description) tuples), 'v' (single video)
        or 'p' (single picture).
        """
        jsonData = None  # fix: previously unbound -> NameError when no script matched
        for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
            try:
                jsonText = e.get_attribute('textContent')
                if 'viewerId' in jsonText:
                    # The post JSON sits between the outermost braces of the script text.
                    jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['PostPage'][0]['graphql']['shortcode_media']
                    break
            except Exception:
                continue
        if jsonData is None:
            raise RuntimeError('post JSON not found on page')
        uploadTimeStamp = jsonData['taken_at_timestamp']
        uploadTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(uploadTimeStamp))
        username = jsonData['owner']['username']
        fullName = jsonData['owner']['full_name']
        likes = jsonData['edge_media_preview_like']['count']
        comments = jsonData['edge_media_preview_comment']['count']
        try:
            text = jsonData['edge_media_to_caption']['edges'][0]['node']['text']
        except (KeyError, IndexError):
            text = ''  # captionless post: the edges list is empty
        try:
            # Multi-media post: iterate the sidecar children.
            mediaList = []
            for obj in jsonData['edge_sidecar_to_children']['edges']:
                try:
                    mediaList.append((obj['node']['video_url'], obj['node']['video_view_count']))
                except KeyError:
                    # No video_url: this child is a picture.
                    mediaList.append((obj['node']['display_url'], obj['node']['accessibility_caption']))
            return uploadTime, username, fullName, likes, comments, text, mediaList, 'm'
        except KeyError:
            try:
                return uploadTime, username, fullName, likes, comments, text, jsonData['video_url'], jsonData['video_view_count'], 'v'
            except KeyError:
                return uploadTime, username, fullName, likes, comments, text, jsonData['display_url'], jsonData['accessibility_caption'], 'p'

    def DownloadInfo(self, info):
        """Write the info tuple into <outputPath>\\<username>\\<timestamp>\\info.txt."""
        now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(time.time())))
        uploadTime = info[0]
        username = info[1]
        fullName = info[2]
        likes = info[3]
        comments = info[4]
        text = info[5]
        # Folder name = upload time with every non-digit stripped, e.g. 20200101123000.
        folder = '{}\\{}\\{}'.format(outputPath, username, ''.join([x for x in uploadTime if x.isdigit()]))
        os.makedirs(folder, exist_ok=True)  # fix: no noisy error print when the folder exists
        with open('{}\\info.txt'.format(folder), 'w', encoding='utf-8') as f:
            f.write('Now: {}'.format(now))
            f.write('\nUpload time: {}'.format(uploadTime))
            f.write('\nUsername: {}'.format(username))
            f.write('\nFull name: {}'.format(fullName))
            f.write('\nText: {}'.format(text))
            f.write('\nLikes: {}'.format(likes))
            f.write('\nComments: {}'.format(comments))
            if info[-1] == 'm':
                mediaList = info[6]
                # NOTE(review): an all-digit picture description would be logged
                # as a video view count here -- confirm against real data.
                for i, (mediaUrl, mediaInfo) in enumerate(mediaList, 1):
                    if str(mediaInfo).isdigit():
                        f.write('\n{}. Video view count: {} Video url: {}'.format(str(i), str(mediaInfo), mediaUrl))
                    else:
                        f.write('\n{}. Picture description: {} Picture url: {}'.format(str(i), mediaInfo, mediaUrl))
            elif info[-1] == 'v':
                f.write('\nVideo view count: {}'.format(info[7]))
                f.write('\nVideo url: {}'.format(info[6]))
            elif info[-1] == 'p':
                f.write('\nPicture description: {}'.format(info[7]))
                f.write('\nPicture url: {}'.format(info[6]))

    def DownloadFile(self, info):
        """Download every picture/video of the post into its folder via wget."""
        uploadTime = info[0]
        username = info[1]
        folder = '{}\\{}\\{}'.format(outputPath, username, ''.join([x for x in uploadTime if x.isdigit()]))
        os.makedirs(folder, exist_ok=True)  # fix: no noisy error print when the folder exists
        # NOTE(review): scraped URLs are interpolated into a shell command;
        # prefer subprocess.run([...], shell=False) to rule out shell injection.
        if info[-1] == 'm':
            for i, (mediaUrl, mediaInfo) in enumerate(info[6], 1):
                # Digit-only info means a video view count, otherwise a picture description.
                ext = 'mp4' if str(mediaInfo).isdigit() else 'jpg'
                os.system('{} --output-document={}\\{}.{} --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, str(i), ext, httpsProxy, mediaUrl))
        elif info[-1] == 'v':
            os.system('{} --output-document={}\\1.mp4 --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, httpsProxy, info[6]))
        elif info[-1] == 'p':
            os.system('{} --output-document={}\\1.jpg --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, httpsProxy, info[6]))

    def Main(self):
        """Open the post page, scrape it, write info.txt and download the media."""
        try:
            fxDriver.get(self.url)
            info = self.GetInfo()
            self.DownloadInfo(info)
            self.DownloadFile(info)
        except Exception as e:
            print(e)
class PROFILE:
    """Collects the links of every post on an Instagram profile page."""

    def __init__(self, profileUrl):
        self.profileUrl = profileUrl  # full https://www.instagram.com/<user>/ URL

    def Update(self):
        """Return how many posts still need downloading (total minus local folders)."""
        jsonData = None  # fix: previously unbound -> NameError when no script matched
        for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
            try:
                jsonText = e.get_attribute('textContent')
                if 'viewerId' in jsonText:
                    # The profile JSON sits between the outermost braces of the script.
                    jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['ProfilePage'][0]['graphql']['user']
                    break
            except Exception:
                continue
        if jsonData is None:
            raise RuntimeError('profile JSON not found on page')
        postCount = jsonData['edge_owner_to_timeline_media']['count']
        username = jsonData['username']
        folder = '{}\\{}'.format(outputPath, username)
        if os.path.exists(folder):
            # One sub-folder per already-downloaded post.
            downloadCount = len([x for x in os.listdir(folder) if os.path.isdir('{}\\{}'.format(folder, x))])
        else:
            downloadCount = 0
        return postCount - downloadCount

    def GetLocY(self):
        """Return (y of the last post link, {post url: x/1000 + y position score})."""
        locY = 0  # fix: previously unbound when no post links were loaded yet
        urlDict = {}
        for e in fxDriver.find_elements_by_xpath('//a[contains(@href, "/p/")]'):
            locY = e.location['y']
            locX = e.location['x']
            # Rows dominate the score; the x term breaks ties within a row.
            urlDict[e.get_attribute('href')] = locX / 1000 + locY
        return locY, urlDict

    def JudgeLoading(self, locY, urlDict):
        """Wait 0.5 s and re-measure; (None, ...) means the page did not grow."""
        time.sleep(0.5)
        locYNew, urlDictNew = self.GetLocY()
        if locYNew > locY:
            # Page grew: keep previously recorded positions for known links.
            urlDictNew.update(urlDict)
        else:
            locYNew = None
        return locYNew, urlDictNew

    def GetWholePage(self):
        """Scroll until enough post links are collected; return the newest ones."""
        updateCount = self.Update()
        fxDriver.execute_script(pageDownJS)
        try:
            # Some profiles hide older posts behind a "show more posts" button.
            fxDriver.find_element_by_xpath('//div[contains(text(), "更多帖子")]').click()
        except Exception as e:
            print(e)
        locY, urlDict = self.GetLocY()
        while 1:
            fxDriver.execute_script(pageDownJS)
            while 1:
                locYNew, urlDictNew = self.JudgeLoading(locY, urlDict)
                urlList = [t[0] for t in sorted(urlDictNew.items(), key=lambda x: x[1])]
                if len(urlList) >= updateCount:
                    return urlList[:updateCount]
                if locYNew is None:  # fix: identity check instead of '== None'
                    # NOTE(review): loops forever if the feed never grows -- consider a retry cap.
                    continue
                locY = locYNew
                urlDict = urlDictNew
                break

    def Main(self):
        """Open the profile page and return the post-link list (None on failure)."""
        try:
            fxDriver.get(self.profileUrl)
            return self.GetWholePage()
        except Exception as e:
            print(e)
def Main():
    """Entry point: dispatch a post link to POST, anything else to PROFILE."""
    inputUrl = input('Please input the instagram link: ')
    if '/p/' in inputUrl:
        # '/p/' marks a single post page.
        POST(inputUrl).Main()
    else:
        # Treat the input as a username unless it is already a profile URL.
        if not 'www.instagram.com' in inputUrl:
            inputUrl = 'https://www.instagram.com/{}/'.format(inputUrl)
        urlList = PROFILE(inputUrl).Main()
        if urlList:
            for url in urlList:
                POST(url).Main()

if __name__ == '__main__':
    # fix: an unconditional Main() call preceded this guard, so the script
    # prompted and ran everything TWICE; it has been removed.
    Main()