import pdb
import time
from urllib.parse import urljoin

import requests
import schedule
import spacy
from lxml import etree
def text_info(url, timeout=10):
    """Download a section page and return its body text as one string.

    The page at ``url`` is fetched with ``requests``, parsed with
    ``lxml.etree.HTML``, and every text node under the element selected by
    ``//html/body/div[3]/div[6]`` is concatenated in document order.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without a timeout a stalled
            connection would block the caller indefinitely.

    Returns:
        The concatenated text of the selected element.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If the page has no element at the expected location.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here rather than with an obscure lookup error below.
    response.raise_for_status()
    page = etree.HTML(response.content)
    matches = page.xpath('//html/body/div[3]/div[6]')
    if not matches:
        raise IndexError("no text container found in page: %s" % url)
    # join() avoids the quadratic cost of repeated string concatenation.
    return "".join(matches[0].itertext())
def get_directory(timeout=10):
    """Fetch the novel's index page and group its sections by chapter.

    The ``<dl>`` under the element with id ``novel101877`` is walked in
    document order. Each ``<dt>`` starts a new chapter (its text becomes the
    chapter name); every other child that contains an ``<a href=...>`` is
    recorded as a section of the current chapter.

    A chapter is only emitted once it has both a name and at least one
    section, so a heading followed directly by another heading is dropped,
    and sections seen before the first heading end up in the first named
    chapter. The final chapter is always emitted, even if it is empty.

    Args:
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A list of ``{"chapter": name, "section": [...]}`` dicts, where each
        section is ``{"name": link_text, "url": absolute_url}``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If the index page does not contain the expected ``<dl>``.
    """
    base_url = "http://www.mhtwx.la/101/101877/"
    response = requests.get(base_url, timeout=timeout)
    response.raise_for_status()
    page = etree.HTML(response.content)
    listing = page.xpath('//*[@id="novel101877"]/dl')[0]

    chapters = []
    sections = []
    chapter_name = ""
    # Iterate the element directly; getchildren() is deprecated in lxml.
    for child in listing:
        if child.tag == "dt":
            # A new heading closes the previous chapter, provided that
            # chapter had a name and at least one section.
            if sections and chapter_name:
                chapters.append({"chapter": chapter_name, "section": sections})
                sections = []
            chapter_name = child.text
            continue

        links = child.xpath("a[@href]")
        if not links:
            # HTML comments or entries without a link carry no section.
            continue
        link = links[0]
        sections.append({
            "name": link.text,
            # urljoin copes with relative, root-relative and absolute hrefs.
            "url": urljoin(base_url, link.get("href")),
        })

    # The last chapter has no following <dt> to flush it.
    chapters.append({"chapter": chapter_name, "section": sections})
    return chapters
def run(debug=True):
    """Crawl the whole novel and attach the text of every section.

    Calls ``get_directory()`` for the chapter/section listing, prints each
    chapter name as it is processed, and stores the result of
    ``text_info(section["url"])`` under the ``"text"`` key of every section.

    Args:
        debug: When true (the default, which keeps the script's existing
            behaviour), drop into ``pdb`` once the crawl has finished so the
            result can be inspected interactively. Pass ``False`` for
            unattended use.

    Returns:
        The list returned by ``get_directory()``, with a ``"text"`` entry
        added to each section dict.
    """
    directory = get_directory()
    for chapter in directory:
        print(chapter["chapter"])
        for section in chapter["section"]:
            section["text"] = text_info(section["url"])
    if debug:
        # Interactive inspection hook; `directory` is in scope here.
        pdb.set_trace()
    return directory
# Script entry point: start the crawl only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run()
"""
Shape of the data built by run():

[
    {
        "chapter": "",
        "section": [
            {
                "name": "",
                "url": "",
                "text": ""
            },
            {
                ...
            }
        ]
    },
    {
        ...
    }
]
"""