urls.txt 文件
https://regex101.com/
https://docs.python.org/3/this-url-will-404.html
https://www.nytimes.com/guides/
https://www.mediamatters.org/
https://1.1.1.1/
https://www.politico.com/tipsheets/morning-money
https://www.bloomberg.com/markets/economics
https://www.ietf.org/rfc/rfc2616.txt
import sys
import aiohttp
import asyncio
import logging
from typing import IO
from aiohttp import ClientSession
# Route timestamped log records to stderr for the whole process.
logging.basicConfig(
    stream=sys.stderr,
    level=logging.DEBUG,  # set the log level: DEBUG and above
    format="%(asctime)s %(levelname)s:%(name)s: %(message)s",
)
# Disable the "chardet.charsetprober" logger (presumably to keep its
# DEBUG chatter out of the output now that the root level is DEBUG).
logging.getLogger("chardet.charsetprober").disabled = True
# Logger used by every function in this script.
logger = logging.getLogger("areq")
async def fetch_html(url: str, session: ClientSession, **kwargs) -> str:
    """Fetch the page source of ``url`` with a GET request.

    Args:
        url: Address to request.
        session: Session object used to issue the request.
        **kwargs: Forwarded unchanged to ``session.request``.

    Returns:
        The response body decoded as text.

    Raises:
        Whatever ``session.request``, ``raise_for_status`` or ``text``
        raise; nothing is caught here.
    """
    response = await session.request(method="GET", url=url, **kwargs)
    # Raise a status-related exception before touching the body.
    response.raise_for_status()
    logger.info("響應碼:[%s] URL: %s", response.status, url)
    return await response.text()
async def parse(url: str, session: ClientSession, **kwargs) -> set:
    """Collect the links found in the page at ``url``.

    The page is downloaded with ``fetch_html``; every ``href="..."`` value
    in it is resolved against ``url`` and added to the result.

    Args:
        url: Page to download and scan.
        session: Session object handed to ``fetch_html``.
        **kwargs: Forwarded unchanged to ``fetch_html``.

    Returns:
        A set of absolute link strings. The set is empty when the download
        fails (the failure is logged, never raised) or no link is found.
    """
    import re
    from urllib.parse import urljoin

    from aiohttp import ClientError
    from aiohttp.http_exceptions import HttpProcessingError

    found = set()
    try:
        html = await fetch_html(url=url, session=session, **kwargs)
    except (ClientError, HttpProcessingError) as e:
        # Not every aiohttp error carries ``message``; without a fallback
        # those were logged as the uninformative "[None]: None".
        logger.error(
            "aiohttp異常 %s [%s]: %s",
            url,
            getattr(e, "status", None),
            getattr(e, "message", None) or str(e) or repr(e),
        )
        return found
    except Exception as e:
        # Best-effort crawl: log with traceback and report "no links".
        logger.exception("發生非aiohttp異常: %s", getattr(e, "__dict__", {}))
        return found

    # Only double-quoted href attributes are matched.
    href_re = re.compile(r'href="(.*?)"')
    # Scan every link on the current page.
    for link in href_re.findall(html):
        try:
            # Resolve relative links against the page URL so the result
            # is usable for crawling the whole site.
            abslink = urljoin(url, link)
        except ValueError:
            # urljoin does no I/O, so ValueError (malformed URL) is the
            # only failure to expect here; skip the bad link.
            logger.exception("解析錯誤的鏈接: %s", link)
        else:
            found.add(abslink)
    logger.info("在 %s發現 %s條鏈接", url, len(found))
    # All links of the current page.
    return found
async def write_one(file: IO, url: str, **kwargs) -> None:
    """Append the links found at ``url`` to ``file``.

    One ``"<url>\\t<link>\\n"`` row is written per link returned by
    ``parse``. Nothing is written when ``parse`` returns no links.

    Args:
        file: Target passed straight to ``aiofiles.open`` in append mode
            (the caller in this script passes a path).
        url: Source page whose links are recorded.
        **kwargs: Forwarded unchanged to ``parse`` (must include whatever
            ``parse`` requires, e.g. ``session``).
    """
    import aiofiles

    res = await parse(url=url, **kwargs)
    if not res:
        return None  # no response data, nothing to record

    # Build all rows first and write them in one call: a single awaited
    # write instead of one per link, which also keeps this source's rows
    # together when several tasks append to the same file.
    rows = "".join(f"{url}\t{p}\n" for p in res)
    # Explicit UTF-8 so links with non-ASCII characters cannot fail on a
    # platform whose default encoding cannot represent them.
    async with aiofiles.open(file, "a", encoding="utf-8") as f:
        await f.write(rows)
    logger.info("根據資源鏈接寫入結果: %s", url)
async def main(file: IO, urls: set, **kwargs) -> None:
    """Run ``write_one`` concurrently for every URL in ``urls``.

    A single ``ClientSession`` is opened and shared by all the
    ``write_one`` calls; the function returns once all of them finish.

    Args:
        file: Output target handed to each ``write_one`` call.
        urls: Source URLs to process.
        **kwargs: Forwarded unchanged to each ``write_one`` call.
    """
    async with ClientSession() as session:
        pending = (
            write_one(file, source, session=session, **kwargs)
            for source in urls
        )
        await asyncio.gather(*pending)
if __name__ == '__main__':
    import pathlib

    # Require Python 3.7+. An explicit check is used instead of ``assert``
    # because assertions are stripped when Python runs with -O.
    if sys.version_info < (3, 7):
        raise SystemExit("Script requires Python 3.7+.")

    # Directory that contains this script.
    here = pathlib.Path(__file__).parent

    # Read the source URLs into a set, one per line; blank lines are
    # dropped so an empty string is never requested as a URL.
    with open(here.joinpath("urls.txt")) as infile:
        stripped = (line.strip() for line in infile)
        urls = {u for u in stripped if u}

    # Start the output file with a header row. The file is closed before
    # the crawl starts because write_one re-opens it in append mode.
    outpath = here.joinpath("foundurls.txt")
    with open(outpath, "w") as outfile:
        outfile.write("source_url\tparsed_url\n")

    # Run the coroutine.
    asyncio.run(main(file=outpath, urls=urls))
輸出:
2019-12-16 20:41:28,711 DEBUG:asyncio: Using selector: SelectSelector
2019-12-16 20:41:29,662 INFO:areq: 響應碼:[200] URL: https://www.ietf.org/rfc/rfc2616.txt
2019-12-16 20:41:29,746 INFO:areq: 響應碼:[200] URL: https://1.1.1.1/
2019-12-16 20:41:30,153 INFO:areq: 在 https://1.1.1.1/發現 13條鏈接
2019-12-16 20:41:30,164 INFO:areq: 在 https://www.ietf.org/rfc/rfc2616.txt發現 0條鏈接
2019-12-16 20:41:30,172 INFO:areq: 根據資源鏈接寫入結果: https://1.1.1.1/
2019-12-16 20:41:30,342 INFO:areq: 響應碼:[200] URL: https://www.mediamatters.org/
2019-12-16 20:41:30,786 INFO:areq: 響應碼:[200] URL: https://regex101.com/
2019-12-16 20:41:30,850 INFO:areq: 在 https://www.mediamatters.org/發現 116條鏈接
2019-12-16 20:41:30,871 INFO:areq: 根據資源鏈接寫入結果: https://www.mediamatters.org/
2019-12-16 20:41:31,292 INFO:areq: 響應碼:[200] URL: https://www.politico.com/tipsheets/morning-money
2019-12-16 20:41:31,367 INFO:areq: 在 https://www.politico.com/tipsheets/morning-money發現 149條鏈接
2019-12-16 20:41:31,400 INFO:areq: 根據資源鏈接寫入結果: https://www.politico.com/tipsheets/morning-money
2019-12-16 20:41:33,031 INFO:areq: 在 https://regex101.com/發現 24條鏈接
2019-12-16 20:41:33,036 INFO:areq: 根據資源鏈接寫入結果: https://regex101.com/
2019-12-16 20:41:36,827 ERROR:areq: aiohttp異常 https://docs.python.org/3/this-url-will-404.html [404]: Not Found
2019-12-16 20:41:50,115 ERROR:areq: aiohttp異常 https://www.nytimes.com/guides/ [None]: None
2019-12-16 20:41:50,116 ERROR:areq: aiohttp異常 https://www.bloomberg.com/markets/economics [None]: None
[Finished in 23.9s]