這段程序參考了其他人的代碼,但由於時間久遠,已經找不到原文鏈接,敬請見諒。
程序demo:
import unicodedata
import urllib
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
def my_align(_string: str, _length: int = 30, _type: str = 'M') -> str:
    """Pad ``_string`` with spaces so it occupies ``_length`` display columns.

    Characters that Unicode classifies as East Asian Wide or Fullwidth (CJK
    ideographs, kana, full-width punctuation, ...) count as two columns, as do
    a few symbols that are handled explicitly below. Every other character,
    including ASCII punctuation, counts as one column.

    Args:
        _string: Text to pad.
        _length: Target width in display columns.
        _type: ``'L'`` pads on the right (left-aligned), ``'R'`` pads on the
            left (right-aligned); any other value centres the text, placing
            the odd leftover space on the right.

    Returns:
        The padded string. Text that is already at least ``_length`` columns
        wide is returned without any padding.
    """
    # Symbols from the original punctuation list that Unicode does not class
    # as Wide/Fullwidth but that are still counted as two columns here:
    # yen sign, ellipsis, curly double quotes, middle dot.
    extra_wide = "\u00a5\u2026\u201c\u201d\u00b7"

    display_width = 0
    for _char in _string:
        if unicodedata.east_asian_width(_char) in ('W', 'F') or _char in extra_wide:
            display_width += 2
        else:
            display_width += 1

    # Clamp at zero so over-long text simply gets no padding.
    _space = max(_length - display_width, 0)
    if _type == 'L':
        _left, _right = 0, _space
    elif _type == 'R':
        _left, _right = _space, 0
    else:
        _left = _space // 2
        _right = _space - _left
    return ' ' * _left + _string + ' ' * _right
# Directory that receives the report (hard-coded Windows path kept from the demo).
dirPath = "D:\\MyProject\\Python\\SpiderCrawler\\"

# Fetch the ten pages of the Douban Top 250 list and write one aligned row per
# film. The ``with`` block closes the file even if a request or a parse step
# raises; UTF-8 is explicit so writing CJK text does not depend on the
# platform's default encoding.
with open(dirPath + 'movie_top250.txt', 'w', encoding='utf-8') as doc:
    print("豆瓣電影TOP250", file=doc)
    # Header columns mirror the row layout below: title, score, vote count, link.
    print('{}{}{}{}'.format(my_align("電影名稱"), my_align("評分"), my_align("評分人數"), my_align("鏈接")), file=doc)
    for i in range(10):
        # ``start`` advances by 25 per page: 0, 25, ..., 225.
        with urllib.request.urlopen('http://movie.douban.com/top250?start=' + str(25 * i) + '&filter=') as page:
            contents = page.read()
        soup = BeautifulSoup(contents, "html.parser")
        for tag in soup.find_all('div', class_='info'):
            m_name = tag.find('span', class_='title').get_text()
            m_rating_score = float(tag.find('span', class_='rating_num').get_text())
            m_people = tag.find('div', class_="star")
            m_span = m_people.find_all('span')
            # Presumably the fourth <span> in the "star" div holds the
            # vote-count text; verify against the live page markup.
            m_peoplecount = m_span[3].contents[0]
            m_url = tag.find('a').get('href')
            print('{}{}{}{}'.format(my_align(m_name), my_align(str(m_rating_score)), my_align(m_peoplecount),
                                    my_align(m_url)), file=doc)
輸出的文本文件(movie_top250.txt)內容如下: