目錄 |
---|
0. 項目介紹 |
1. 爬取賬戶頁中所有帖子的鏈接 |
2. 爬取並下載帖子頁中的信息、圖片和視頻 |
3. 完整代碼 |
0. 項目介紹
本項目的目的是輸入Instagram賬號或賬戶頁鏈接或帖子頁鏈接,輸出該賬戶帖子的:① 包含當前時間、用戶上傳帖子的時間(當前時區)、用戶名稱(Username)、用戶全稱(Full name)、帖子文字、點贊數、評論數、圖片描述(當帖子中有圖片時)、圖片鏈接(當帖子中有圖片時)、視頻觀看數(當帖子中有視頻時)、視頻鏈接(當帖子中有視頻時)的文本文檔;② 圖片(當帖子中有圖片時)、視頻(當帖子中有視頻時)。
本項目需要先導入如下庫:
from selenium import webdriver
from multiprocessing import Pool
import json, time, os
本項目的全局變量如下:
sslPort =
fxBinaryPath = ''
geckodriverPath = ''
pageDownJS = 'document.documentElement.scrollTop = 100000000'
outputPath = ''
ariaPath = ''
httpsProxy = 'https://127.0.0.1:{}/'.format(str(sslPort))
解釋:
sslPort
:可用於訪問Instagram的HTTPS代理的本地端口。fxBinaryPath
:Firefox瀏覽器的firefox.exe
的絕對路徑。geckodriverPath
:geckodriver.exe
的絕對路徑。pageDownJS
:用於下拉頁面的JavaScript代碼。outputPath
:輸出路徑。wgetPath
:wget.exe
的絕對路徑。httpsProxy
:用於GNU Wget for Windows的HTTPS代理。
本項目的基本結構如下:
def Driver():
# Driver函數用於構造Firefox瀏覽器實例,輸出瀏覽器實例
class DOWNLOAD:
# DOWNLOAD類是一個多進程下載工具
class POST:
# POST類用於爬取並下載帖子頁中的信息、圖片和視頻
class PROFILE:
# PROFILE類用於爬取賬戶頁中所有帖子的鏈接
def Main():
# Main函數是主函數,輸入Instagram賬號或賬戶頁鏈接或帖子頁鏈接,控制各類和函數
if __name__ == '__main__':
Main()
本項目的運行流程可見Main函數:
def Main():
    """Entry point: ask for an Instagram link/username and scrape it.

    A link containing '/p/' is treated as a single post page; anything else
    is treated as a profile (a bare username is expanded to a profile URL)
    and every collected post link is downloaded in turn.
    """
    fxDriver = Driver()
    inputUrl = input('Please input instagram link or username: ')
    if '/p/' in inputUrl:
        # Single post link: scrape just that post.
        # NOTE: POST/PROFILE __init__ take one argument; the driver is the
        # module-level global, so it is not passed to the constructors.
        POST(inputUrl).Main()
    else:
        if 'www.instagram.com' not in inputUrl:
            # Bare username: expand to a full profile URL.
            inputUrl = 'https://www.instagram.com/{}/'.format(inputUrl)
        urlList = PROFILE(inputUrl).Main()
        if urlList:
            total = len(urlList)
            for i, url in enumerate(urlList, start=1):
                POST(url).Main()
                print('\n\n{:.2f} % completed.\n\n'.format(i / total * 100))
    fxDriver.quit()
Main()
1. 爬取賬戶頁中所有帖子的鏈接
這一步的基本結構如下:
def Main(self):
try:
fxDriver.get(self.profileUrl)
urlList = self.GetWholePage()
return urlList
except Exception as e:
print(e)
解釋:① 瀏覽器訪問賬戶頁。② self.GetWholePage()
負責爬取賬戶頁中所有帖子的鏈接,生成鏈接列表urlList
。
self.GetWholePage()
如下:
def GetWholePage(self):
updateCount = self.Update()
fxDriver.execute_script(pageDownJS)
try:
fxDriver.find_element_by_xpath('//div[contains(text(), "更多帖子")]').click()
except Exception as e:
print(e)
locY, urlDict = self.GetLocY()
while 1:
fxDriver.execute_script(pageDownJS)
while 1:
locYNew, urlDictNew = self.JudgeLoading(locY, urlDict)
urlList = [t[0] for t in sorted(urlDictNew.items(), key = lambda x:x[1])]
if len(urlList) >= updateCount:
return urlList[: updateCount]
if locYNew == None:
continue
else:
locY = locYNew
urlDict = urlDictNew
break
解釋:
self.Update()
用於計算需要更新的貼子數。fxDriver.execute_script(pageDownJS)
可以通過執行JS代碼pageDownJS
把頁面拉到最下面。self.GetLocY()
可以獲得賬戶頁HTML中每個帖子鏈接所在tag的Y座標locY
和當前加載的所有帖子的鏈接字典urlDict
。self.JudgeLoading(locY, urlDict)
可以對比輸入的Y座標和0.5秒之後的Y座標來判斷pageDownJS
有沒有執行完畢。
self.Update()
如下:
def Update(self):
for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
try:
jsonText = e.get_attribute('textContent')
if 'viewerId' in jsonText:
jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['ProfilePage'][0]['graphql']['user']
break
except:
continue
postCount = jsonData['edge_owner_to_timeline_media']['count']
username = jsonData['username']
folder = '{}\\{}'.format(outputPath, username)
if os.path.exists(folder):
downloadCount = len([x for x in os.listdir(folder) if os.path.isdir('{}\\{}'.format(folder, x))])
else:
downloadCount = 0
updateCount = postCount - downloadCount
return updateCount
解釋:① 解析網頁中的貼子數。② 統計已經下載了多少帖子。③ 計算需要更新的貼子數。
self.GetLocY()
如下:
def GetLocY(self):
urlDict = {}
for e in fxDriver.find_elements_by_xpath('//a[contains(@href, "/p/")]'):
locY = e.location['y']
locX = e.location['x']
url = e.get_attribute('href')
urlDict[url] = locX/1000 + locY
return locY, urlDict
解釋:通過循環判斷'/p/'
有沒有在a
標籤的'href'
屬性中來獲得帖子鏈接及其所在tag的Y座標。
self.JudgeLoading(locY, urlDict)
如下:
def JudgeLoading(self, locY, urlDict):
time.sleep(0.5)
locYNew, urlDictNew = self.GetLocY()
if locYNew > locY:
urlDictNew.update(urlDict)
else:
locYNew = None
return locYNew, urlDictNew
把上述模塊如下整合到類中:
class PROFILE:
    """Scrapes the links of every post on an Instagram profile page.

    Relies on the module-level globals ``fxDriver`` (Selenium Firefox
    driver), ``pageDownJS`` and ``outputPath``.
    """

    def __init__(self, profileUrl):
        self.profileUrl = profileUrl  # URL of the profile page to scrape

    def Update(self):
        """Return how many posts still need downloading.

        Parses the profile page's embedded JSON for the total post count,
        then subtracts the number of per-post folders already on disk.
        """
        jsonData = None
        for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
            try:
                jsonText = e.get_attribute('textContent')
                if 'viewerId' in jsonText:
                    # Strip the JS wrapper; keep only the JSON object literal.
                    jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['ProfilePage'][0]['graphql']['user']
                    break
            except Exception:
                continue
        postCount = jsonData['edge_owner_to_timeline_media']['count']
        username = jsonData['username']
        folder = '{}\\{}'.format(outputPath, username)
        if os.path.exists(folder):
            # Each previously downloaded post lives in its own sub-folder.
            downloadCount = len([x for x in os.listdir(folder) if os.path.isdir('{}\\{}'.format(folder, x))])
        else:
            downloadCount = 0
        return postCount - downloadCount

    def GetLocY(self):
        """Return (Y coordinate of the last post link, {post url: sort key}).

        The sort key combines X and Y coordinates (locX/1000 + locY) so that
        posts sort in visual, row-major order.
        """
        locY = 0  # fix: stays 0 when no post links exist yet (was a NameError)
        urlDict = {}
        for e in fxDriver.find_elements_by_xpath('//a[contains(@href, "/p/")]'):
            locY = e.location['y']
            locX = e.location['x']
            url = e.get_attribute('href')
            urlDict[url] = locX / 1000 + locY
        return locY, urlDict

    def JudgeLoading(self, locY, urlDict):
        """Wait 0.5 s, re-measure; return (new Y or None, merged url dict).

        A larger Y after the wait means new posts finished loading; ``None``
        signals the page has not scrolled further yet.
        """
        time.sleep(0.5)
        locYNew, urlDictNew = self.GetLocY()
        if locYNew > locY:
            # Keep previously collected links that may have scrolled away.
            urlDictNew.update(urlDict)
        else:
            locYNew = None
        return locYNew, urlDictNew

    def GetWholePage(self):
        """Keep scrolling until ``Update()`` post links are collected; return them."""
        updateCount = self.Update()
        fxDriver.execute_script(pageDownJS)
        try:
            # Some profiles hide older posts behind a "more posts" button.
            fxDriver.find_element_by_xpath('//div[contains(text(), "更多帖子")]').click()
        except Exception as e:
            print(e)
        locY, urlDict = self.GetLocY()
        while True:
            fxDriver.execute_script(pageDownJS)
            while True:
                locYNew, urlDictNew = self.JudgeLoading(locY, urlDict)
                # Sort links by their on-page position (top-left first).
                urlList = [t[0] for t in sorted(urlDictNew.items(), key=lambda x: x[1])]
                if len(urlList) >= updateCount:
                    return urlList[:updateCount]
                if locYNew is None:
                    continue  # still loading: measure again
                locY = locYNew
                urlDict = urlDictNew
                break  # page grew: scroll down again

    def Main(self):
        """Open the profile page and return the list of post links (None on error)."""
        try:
            fxDriver.get(self.profileUrl)
            return self.GetWholePage()
        except Exception as e:
            print(e)
解釋:可以通過執行PROFILE(inputUrl).Main()
獲得賬戶頁中所有帖子的鏈接。
2. 爬取並下載帖子頁中的信息、圖片和視頻
這一步的基本結構如下:
def Main(self):
try:
fxDriver.get(self.url)
info = self.GetInfo()
self.DownloadInfo(info)
self.DownloadFile(info)
except Exception as e:
print(e)
解釋:① 瀏覽器訪問帖子頁。② self.GetInfo()
可以獲得用戶上傳帖子的時間(當前時區)、用戶名稱(Username)、用戶全稱(Full name)、帖子文字、點贊數、評論數、圖片描述(當帖子中有圖片時)、圖片鏈接(當帖子中有圖片時)、視頻觀看數(當帖子中有視頻時)、視頻鏈接(當帖子中有視頻時)等信息。③ self.DownloadInfo(info)
把信息寫入文本文檔。④ self.DownloadFile(info)
根據獲取的信息下載帖子頁中的圖片和視頻。
self.GetInfo()
如下:
def GetInfo(self):
for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
try:
jsonText = e.get_attribute('textContent')
if '"viewerId":null' in jsonText:
jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['PostPage'][0]['graphql']['shortcode_media']
break
except:
continue
uploadTimeStamp = jsonData['taken_at_timestamp']
uploadTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(uploadTimeStamp))
username = jsonData['owner']['username']
fullName = jsonData['owner']['full_name']
likes = jsonData['edge_media_preview_like']['count']
comments = jsonData['edge_media_preview_comment']['count']
try:
text = jsonData['edge_media_to_caption']['edges'][0]['node']['text']
except:
text = ''
try:
mediaDict = {}
for obj in jsonData['edge_sidecar_to_children']['edges']:
try:
vidUrl = obj['node']['video_url']
vidViewCount = obj['node']['video_view_count']
mediaDict[vidUrl] = vidViewCount
except:
picUrl = obj['node']['display_url']
picDescription = obj['node']['accessibility_caption']
mediaDict[picUrl] = picDescription
return uploadTime, username, fullName, likes, comments, text, mediaDict, 'm'
except:
try:
vidUrl = jsonData['video_url']
vidViewCount = jsonData['video_view_count']
return uploadTime, username, fullName, likes, comments, text, vidUrl, vidViewCount, 'v'
except:
picUrl = jsonData['display_url']
picDescription = jsonData['accessibility_caption']
return uploadTime, username, fullName, likes, comments, text, picUrl, picDescription, 'p'
解釋:① 無需登錄即可獲取包含所有信息的JSON數據。② 獲取一些通用的信息,如發佈時間、用戶名、點贊數、評論數等。③ 獲取圖片或視頻的鏈接和相關信息。帖子可以分爲多圖(視頻)和單圖(視頻),而在多圖(視頻)帖子中需要判斷是圖片還是視頻,在單圖(視頻)帖子中也要判斷。
self.DownloadInfo(info)
如下:
def DownloadInfo(self, info):
now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(time.time())))
uploadTime = info[0]
username = info[1]
fullName = info[2]
likes = info[3]
comments = info[4]
text = info[5]
folder = '{}\\{}\\{}'.format(outputPath, username, uploadTime.replace('-', '').replace(':', '').replace(' ', ''))
try:
os.makedirs(folder)
except Exception as e:
print(e)
with open('{}\\info.txt'.format(folder), 'w', encoding='utf-8') as f:
f.write('Now: {}'.format(now))
f.write('\nUpload time: {}'.format(uploadTime))
f.write('\nUsername: {}'.format(username))
f.write('\nFull name: {}'.format(fullName))
f.write('\nText: {}'.format(text))
f.write('\nLikes: {}'.format(likes))
f.write('\nComments: {}'.format(comments))
if info[-1] == 'm':
mediaDict = info[6]
i = 1
for mediaUrl, mediaInfo in mediaDict.items():
if str(mediaInfo).isdigit():
f.write('\n{}. Video view count: {}'.format(str(i), str(mediaInfo)))
f.write('\n{}. Video url: {}'.format(str(i), mediaUrl))
else:
f.write('\n{}. Picture description: {}'.format(str(i), mediaInfo))
f.write('\n{}. Picture url: {}'.format(str(i), mediaUrl))
i += 1
elif info[-1] == 'v':
vidUrl = info[6]
vidViewCount = info[7]
f.write('\nVideo view count: {}'.format(vidViewCount))
f.write('\nVideo url: {}'.format(vidUrl))
elif info[-1] == 'p':
picUrl = info[6]
picDescription = info[7]
f.write('\nPicture description: {}'.format(picDescription))
f.write('\nPicture url: {}'.format(picUrl))
self.DownloadFile(info)
如下:
def DownloadFile(self, info):
uploadTime = info[0]
username = info[1]
folder = '{}\\{}\\{}'.format(outputPath, username, uploadTime.replace('-', '').replace(':', '').replace(' ', ''))
if info[-1] == 'm':
mediaDict = info[6]
i = 1
for mediaUrl, mediaInfo in mediaDict.items():
if str(mediaInfo).isdigit():
os.system('{} --output-document={}\\{}.mp4 --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, str(i), httpsProxy, mediaUrl))
else:
os.system('{} --output-document={}\\{}.jpg --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, str(i), httpsProxy, mediaUrl))
i += 1
elif info[-1] == 'v':
vidUrl = info[6]
os.system('{} --output-document={}\\1.mp4 --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, httpsProxy, vidUrl))
elif info[-1] == 'p':
picUrl = info[6]
os.system('{} --output-document={}\\1.jpg --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'.format(wgetPath, folder, httpsProxy, picUrl))
解釋:參考這篇博文。
把上述模塊如下整合到類中:
class POST:
    """Scrapes one Instagram post page and downloads its info, pictures and videos.

    Relies on the module-level globals ``fxDriver``, ``outputPath``,
    ``wgetPath`` and ``httpsProxy``.
    """

    def __init__(self, url):
        self.url = url  # URL of the post page

    def GetInfo(self):
        """Parse the post page's embedded JSON and return a tuple of post data.

        The last element of the returned tuple tags the media layout:
        'm' -> multi-media post; info[6] is a list of (url, view count or
               description) tuples.
        'v' -> single video; info[6]/info[7] are the url and view count.
        'p' -> single picture; info[6]/info[7] are the url and description.
        """
        jsonData = None
        for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
            try:
                jsonText = e.get_attribute('textContent')
                if 'viewerId' in jsonText:
                    # Strip the JS wrapper; keep only the JSON object literal.
                    jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['PostPage'][0]['graphql']['shortcode_media']
                    break
            except Exception:
                continue
        uploadTimeStamp = jsonData['taken_at_timestamp']
        uploadTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(uploadTimeStamp))
        username = jsonData['owner']['username']
        fullName = jsonData['owner']['full_name']
        likes = jsonData['edge_media_preview_like']['count']
        comments = jsonData['edge_media_preview_comment']['count']
        try:
            text = jsonData['edge_media_to_caption']['edges'][0]['node']['text']
        except (KeyError, IndexError):
            text = ''  # post has no caption
        try:
            # Multi-media post: one entry per child picture/video.
            mediaList = []
            for obj in jsonData['edge_sidecar_to_children']['edges']:
                try:
                    mediaList.append((obj['node']['video_url'], obj['node']['video_view_count']))
                except KeyError:
                    # No video_url: the child is a picture.
                    mediaList.append((obj['node']['display_url'], obj['node']['accessibility_caption']))
            return uploadTime, username, fullName, likes, comments, text, mediaList, 'm'
        except KeyError:
            try:
                return uploadTime, username, fullName, likes, comments, text, jsonData['video_url'], jsonData['video_view_count'], 'v'
            except KeyError:
                return uploadTime, username, fullName, likes, comments, text, jsonData['display_url'], jsonData['accessibility_caption'], 'p'

    def DownloadInfo(self, info):
        """Write the scraped post data from ``info`` into <folder>\\info.txt."""
        now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(time.time())))
        uploadTime, username, fullName, likes, comments, text = info[:6]
        # Folder name is the upload time with every non-digit stripped.
        folder = '{}\\{}\\{}'.format(outputPath, username, ''.join([x for x in uploadTime if x.isdigit()]))
        try:
            os.makedirs(folder)
        except Exception as e:
            print(e)  # folder may already exist; keep going
        with open('{}\\info.txt'.format(folder), 'w', encoding='utf-8') as f:
            f.write('Now: {}'.format(now))
            f.write('\nUpload time: {}'.format(uploadTime))
            f.write('\nUsername: {}'.format(username))
            f.write('\nFull name: {}'.format(fullName))
            f.write('\nText: {}'.format(text))
            f.write('\nLikes: {}'.format(likes))
            f.write('\nComments: {}'.format(comments))
            if info[-1] == 'm':
                # NOTE(review): an all-digit value is assumed to be a video view
                # count; a picture description that is purely digits would be
                # misclassified here.
                for i, (mediaUrl, mediaInfo) in enumerate(info[6], start=1):
                    if str(mediaInfo).isdigit():
                        f.write('\n{}. Video view count: {} Video url: {}'.format(i, mediaInfo, mediaUrl))
                    else:
                        f.write('\n{}. Picture description: {} Picture url: {}'.format(i, mediaInfo, mediaUrl))
            elif info[-1] == 'v':
                f.write('\nVideo view count: {}'.format(info[7]))
                f.write('\nVideo url: {}'.format(info[6]))
            elif info[-1] == 'p':
                f.write('\nPicture description: {}'.format(info[7]))
                f.write('\nPicture url: {}'.format(info[6]))

    def DownloadFile(self, info):
        """Download the post's pictures/videos with wget through the HTTPS proxy.

        SECURITY NOTE(review): the wget command line is built by string
        formatting and executed via os.system; a crafted URL could inject
        shell commands. Prefer subprocess.run with an argument list.
        """
        uploadTime = info[0]
        username = info[1]
        folder = '{}\\{}\\{}'.format(outputPath, username, ''.join([x for x in uploadTime if x.isdigit()]))
        try:
            os.makedirs(folder)
        except Exception as e:
            print(e)  # usually already created by DownloadInfo
        wgetCmd = '{} --output-document={}\\{}.{} --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'
        if info[-1] == 'm':
            for i, (mediaUrl, mediaInfo) in enumerate(info[6], start=1):
                # All-digit mediaInfo == view count == video; otherwise picture.
                ext = 'mp4' if str(mediaInfo).isdigit() else 'jpg'
                os.system(wgetCmd.format(wgetPath, folder, i, ext, httpsProxy, mediaUrl))
        elif info[-1] == 'v':
            os.system(wgetCmd.format(wgetPath, folder, 1, 'mp4', httpsProxy, info[6]))
        elif info[-1] == 'p':
            os.system(wgetCmd.format(wgetPath, folder, 1, 'jpg', httpsProxy, info[6]))

    def Main(self):
        """Open the post page, scrape it, and download its info and media."""
        try:
            fxDriver.get(self.url)
            info = self.GetInfo()
            self.DownloadInfo(info)
            self.DownloadFile(info)
        except Exception as e:
            print(e)
解釋:可以通過執行POST(url).Main()
下載每一個帖子的信息、圖片和視頻。
3. 完整代碼
from selenium import webdriver
import json, time, os
# -- user configuration: fill these in before running --
sslPort =
fxBinaryPath = ''
geckodriverPath = ''
pageDownJS = 'document.documentElement.scrollTop = 100000000'
outputPath = ''
wgetPath = ''
httpsProxy = 'https://127.0.0.1:{}/'.format(str(sslPort))
# Build a Firefox profile that routes SSL traffic through the local proxy.
fxProfile = webdriver.firefox.firefox_profile.FirefoxProfile()
fxProfile.set_preference('network.proxy.type', 1)  # 1 = manual proxy configuration
fxProfile.set_preference('network.proxy.ssl', '127.0.0.1')  # SSL proxy host (local tunnel)
fxProfile.set_preference('network.proxy.ssl_port', sslPort)
fxProfile.set_preference('network.proxy.socks_remote_dns', True)  # resolve DNS through the proxy
fxProfile.set_preference('network.trr.mode', 2)  # DNS-over-HTTPS preferred, native fallback
fxProfile.set_preference('permissions.default.image', 2)  # 2 = block images, speeds up scraping
# Chinese first in accept_languages — presumably so the "更多帖子" button
# text the scraper clicks on renders in Chinese; verify against PROFILE.GetWholePage.
fxProfile.set_preference('intl.accept_languages', 'zh-CN, zh, zh-TW, zh-HK, en-US, en')
fxDriver = webdriver.firefox.webdriver.WebDriver(firefox_profile=fxProfile, firefox_binary=fxBinaryPath, executable_path=geckodriverPath)
class POST:
    """Scrapes one Instagram post page and downloads its info, pictures and videos.

    Relies on the module-level globals ``fxDriver``, ``outputPath``,
    ``wgetPath`` and ``httpsProxy``.
    """

    def __init__(self, url):
        self.url = url  # URL of the post page

    def GetInfo(self):
        """Parse the post page's embedded JSON and return a tuple of post data.

        The last element of the returned tuple tags the media layout:
        'm' -> multi-media post; info[6] is a list of (url, view count or
               description) tuples.
        'v' -> single video; info[6]/info[7] are the url and view count.
        'p' -> single picture; info[6]/info[7] are the url and description.
        """
        jsonData = None
        for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
            try:
                jsonText = e.get_attribute('textContent')
                if 'viewerId' in jsonText:
                    # Strip the JS wrapper; keep only the JSON object literal.
                    jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['PostPage'][0]['graphql']['shortcode_media']
                    break
            except Exception:
                continue
        uploadTimeStamp = jsonData['taken_at_timestamp']
        uploadTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(uploadTimeStamp))
        username = jsonData['owner']['username']
        fullName = jsonData['owner']['full_name']
        likes = jsonData['edge_media_preview_like']['count']
        comments = jsonData['edge_media_preview_comment']['count']
        try:
            text = jsonData['edge_media_to_caption']['edges'][0]['node']['text']
        except (KeyError, IndexError):
            text = ''  # post has no caption
        try:
            # Multi-media post: one entry per child picture/video.
            mediaList = []
            for obj in jsonData['edge_sidecar_to_children']['edges']:
                try:
                    mediaList.append((obj['node']['video_url'], obj['node']['video_view_count']))
                except KeyError:
                    # No video_url: the child is a picture.
                    mediaList.append((obj['node']['display_url'], obj['node']['accessibility_caption']))
            return uploadTime, username, fullName, likes, comments, text, mediaList, 'm'
        except KeyError:
            try:
                return uploadTime, username, fullName, likes, comments, text, jsonData['video_url'], jsonData['video_view_count'], 'v'
            except KeyError:
                return uploadTime, username, fullName, likes, comments, text, jsonData['display_url'], jsonData['accessibility_caption'], 'p'

    def DownloadInfo(self, info):
        """Write the scraped post data from ``info`` into <folder>\\info.txt."""
        now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(time.time())))
        uploadTime, username, fullName, likes, comments, text = info[:6]
        # Folder name is the upload time with every non-digit stripped.
        folder = '{}\\{}\\{}'.format(outputPath, username, ''.join([x for x in uploadTime if x.isdigit()]))
        try:
            os.makedirs(folder)
        except Exception as e:
            print(e)  # folder may already exist; keep going
        with open('{}\\info.txt'.format(folder), 'w', encoding='utf-8') as f:
            f.write('Now: {}'.format(now))
            f.write('\nUpload time: {}'.format(uploadTime))
            f.write('\nUsername: {}'.format(username))
            f.write('\nFull name: {}'.format(fullName))
            f.write('\nText: {}'.format(text))
            f.write('\nLikes: {}'.format(likes))
            f.write('\nComments: {}'.format(comments))
            if info[-1] == 'm':
                # NOTE(review): an all-digit value is assumed to be a video view
                # count; a picture description that is purely digits would be
                # misclassified here.
                for i, (mediaUrl, mediaInfo) in enumerate(info[6], start=1):
                    if str(mediaInfo).isdigit():
                        f.write('\n{}. Video view count: {} Video url: {}'.format(i, mediaInfo, mediaUrl))
                    else:
                        f.write('\n{}. Picture description: {} Picture url: {}'.format(i, mediaInfo, mediaUrl))
            elif info[-1] == 'v':
                f.write('\nVideo view count: {}'.format(info[7]))
                f.write('\nVideo url: {}'.format(info[6]))
            elif info[-1] == 'p':
                f.write('\nPicture description: {}'.format(info[7]))
                f.write('\nPicture url: {}'.format(info[6]))

    def DownloadFile(self, info):
        """Download the post's pictures/videos with wget through the HTTPS proxy.

        SECURITY NOTE(review): the wget command line is built by string
        formatting and executed via os.system; a crafted URL could inject
        shell commands. Prefer subprocess.run with an argument list.
        """
        uploadTime = info[0]
        username = info[1]
        folder = '{}\\{}\\{}'.format(outputPath, username, ''.join([x for x in uploadTime if x.isdigit()]))
        try:
            os.makedirs(folder)
        except Exception as e:
            print(e)  # usually already created by DownloadInfo
        wgetCmd = '{} --output-document={}\\{}.{} --no-check-certificate --execute https_proxy={} --execute robots=off --continue "{}"'
        if info[-1] == 'm':
            for i, (mediaUrl, mediaInfo) in enumerate(info[6], start=1):
                # All-digit mediaInfo == view count == video; otherwise picture.
                ext = 'mp4' if str(mediaInfo).isdigit() else 'jpg'
                os.system(wgetCmd.format(wgetPath, folder, i, ext, httpsProxy, mediaUrl))
        elif info[-1] == 'v':
            os.system(wgetCmd.format(wgetPath, folder, 1, 'mp4', httpsProxy, info[6]))
        elif info[-1] == 'p':
            os.system(wgetCmd.format(wgetPath, folder, 1, 'jpg', httpsProxy, info[6]))

    def Main(self):
        """Open the post page, scrape it, and download its info and media."""
        try:
            fxDriver.get(self.url)
            info = self.GetInfo()
            self.DownloadInfo(info)
            self.DownloadFile(info)
        except Exception as e:
            print(e)
class PROFILE:
    """Scrapes the links of every post on an Instagram profile page.

    Relies on the module-level globals ``fxDriver`` (Selenium Firefox
    driver), ``pageDownJS`` and ``outputPath``.
    """

    def __init__(self, profileUrl):
        self.profileUrl = profileUrl  # URL of the profile page to scrape

    def Update(self):
        """Return how many posts still need downloading.

        Parses the profile page's embedded JSON for the total post count,
        then subtracts the number of per-post folders already on disk.
        """
        jsonData = None
        for e in fxDriver.find_elements_by_xpath('//script[@type="text/javascript"]'):
            try:
                jsonText = e.get_attribute('textContent')
                if 'viewerId' in jsonText:
                    # Strip the JS wrapper; keep only the JSON object literal.
                    jsonData = json.loads(jsonText[jsonText.find('{'): jsonText.rfind('}') + 1])['entry_data']['ProfilePage'][0]['graphql']['user']
                    break
            except Exception:
                continue
        postCount = jsonData['edge_owner_to_timeline_media']['count']
        username = jsonData['username']
        folder = '{}\\{}'.format(outputPath, username)
        if os.path.exists(folder):
            # Each previously downloaded post lives in its own sub-folder.
            downloadCount = len([x for x in os.listdir(folder) if os.path.isdir('{}\\{}'.format(folder, x))])
        else:
            downloadCount = 0
        return postCount - downloadCount

    def GetLocY(self):
        """Return (Y coordinate of the last post link, {post url: sort key}).

        The sort key combines X and Y coordinates (locX/1000 + locY) so that
        posts sort in visual, row-major order.
        """
        locY = 0  # fix: stays 0 when no post links exist yet (was a NameError)
        urlDict = {}
        for e in fxDriver.find_elements_by_xpath('//a[contains(@href, "/p/")]'):
            locY = e.location['y']
            locX = e.location['x']
            url = e.get_attribute('href')
            urlDict[url] = locX / 1000 + locY
        return locY, urlDict

    def JudgeLoading(self, locY, urlDict):
        """Wait 0.5 s, re-measure; return (new Y or None, merged url dict).

        A larger Y after the wait means new posts finished loading; ``None``
        signals the page has not scrolled further yet.
        """
        time.sleep(0.5)
        locYNew, urlDictNew = self.GetLocY()
        if locYNew > locY:
            # Keep previously collected links that may have scrolled away.
            urlDictNew.update(urlDict)
        else:
            locYNew = None
        return locYNew, urlDictNew

    def GetWholePage(self):
        """Keep scrolling until ``Update()`` post links are collected; return them."""
        updateCount = self.Update()
        fxDriver.execute_script(pageDownJS)
        try:
            # Some profiles hide older posts behind a "more posts" button.
            fxDriver.find_element_by_xpath('//div[contains(text(), "更多帖子")]').click()
        except Exception as e:
            print(e)
        locY, urlDict = self.GetLocY()
        while True:
            fxDriver.execute_script(pageDownJS)
            while True:
                locYNew, urlDictNew = self.JudgeLoading(locY, urlDict)
                # Sort links by their on-page position (top-left first).
                urlList = [t[0] for t in sorted(urlDictNew.items(), key=lambda x: x[1])]
                if len(urlList) >= updateCount:
                    return urlList[:updateCount]
                if locYNew is None:
                    continue  # still loading: measure again
                locY = locYNew
                urlDict = urlDictNew
                break  # page grew: scroll down again

    def Main(self):
        """Open the profile page and return the list of post links (None on error)."""
        try:
            fxDriver.get(self.profileUrl)
            return self.GetWholePage()
        except Exception as e:
            print(e)
def Main():
    """Entry point: ask for an Instagram link/username and scrape it.

    A link containing '/p/' is treated as a single post page; anything else
    is treated as a profile (a bare username is expanded to a profile URL)
    and every collected post link is downloaded in turn.
    """
    inputUrl = input('Please input the instagram link: ')
    if '/p/' in inputUrl:
        POST(inputUrl).Main()
    else:
        if 'www.instagram.com' not in inputUrl:
            # Bare username: expand to a full profile URL.
            inputUrl = 'https://www.instagram.com/{}/'.format(inputUrl)
        urlList = PROFILE(inputUrl).Main()
        if urlList:
            for url in urlList:
                POST(url).Main()
if __name__ == '__main__':
    # Fix: the original also called Main() unconditionally before this guard,
    # so it ran twice as a script and once on mere import; now it runs once.
    Main()