这段代码参考了其他人的实现,但由于时间久远,原文链接已经找不到了,敬请谅解。
程序demo:
import unicodedata
import urllib
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
def my_align(_string, _length=30, _type='M'):
    """Pad ``_string`` with spaces so it occupies ``_length`` display columns.

    Wide (East Asian) characters are counted as two columns so that mixed
    Chinese/ASCII text lines up in a monospaced view.

    Args:
        _string: Text to pad.
        _length: Target display width in columns.
        _type: ``'L'`` pads on the right (left-aligned), ``'R'`` pads on the
            left (right-aligned); any other value centers the text, putting
            the odd extra space on the right.

    Returns:
        The padded string. If the text is already at least ``_length``
        columns wide it is returned without padding (never truncated).
    """
    # Punctuation that is always counted as two columns.
    chinese_symbol = "!@#¥%…&*():“”《》?·。,;【】"
    _str_len = len(_string)  # Start from one column per character.
    for _char in _string:
        # Add a second column for every wide character: the basic CJK
        # ideograph range, the punctuation above, and anything Unicode
        # classifies as Wide/Fullwidth (kana, Hangul, full-width Latin, ...).
        if (u'\u4e00' <= _char <= u'\u9fa5'
                or _char in chinese_symbol
                or unicodedata.east_asian_width(_char) in ('W', 'F')):
            _str_len += 1
    _space = _length - _str_len  # Columns left to fill; may be negative.
    if _type == 'L':
        _left = 0
        _right = _space
    elif _type == 'R':
        _left = _space
        _right = 0
    else:
        _left = _space // 2
        _right = _space - _left
    # Multiplying a string by a non-positive count yields '', so over-long
    # input simply gets no padding.
    return ' ' * _left + _string + ' ' * _right
dirPath = "D:\\MyProject\\Python\\SpiderCrawler\\"

# Fetch the Douban Top 250 list pages at offsets 0, 25, ..., 225 and write one
# aligned row (title, rating, number of raters, link) per movie entry.
# ``with`` closes the file even if a request or a parsing step raises, and the
# explicit encoding keeps the Chinese text writable regardless of the
# platform's default codec.
with open(dirPath + 'movie_top250.txt', 'w', encoding='utf-8') as doc:
    print("豆瓣电影TOP250", file=doc)
    # Column headers. The first column holds the movie title; the header
    # previously repeated the rating label there.
    print('{}{}{}{}'.format(my_align("电影名"), my_align("评分"), my_align("评分人数"), my_align("链接")), file=doc)
    for i in range(10):
        url = 'http://movie.douban.com/top250?start=' + str(25 * i) + '&filter='
        # Release the HTTP response as soon as its body has been read.
        with urllib.request.urlopen(url) as page:
            contents = page.read()
        soup = BeautifulSoup(contents, "html.parser")
        for tag in soup.find_all('div', class_='info'):
            m_name = tag.find('span', class_='title').get_text()
            m_rating_score = float(tag.find('span', class_='rating_num').get_text())
            m_people = tag.find('div', class_="star")
            m_span = m_people.find_all('span')
            # The fourth <span> inside div.star is taken as the rater-count
            # text -- NOTE(review): depends on the page layout; confirm.
            m_peoplecount = m_span[3].contents[0]
            m_url = tag.find('a').get('href')
            print('{}{}{}{}'.format(my_align(m_name), my_align(str(m_rating_score)), my_align(m_peoplecount),
                                    my_align(m_url)), file=doc)
输出文件(movie_top250.txt)的结果: