1、統計目錄下多個txt文件,找出出現頻率最多的單詞
#coding=utf-8
##目錄下多個txt文件,找出出現頻率最多的單詞
import os,re
from collections import Counter
# Directory passed to getRun() as the location of the .txt files.
FILESOURECE = "D://pytest"

# Stop words: tokens whose counts are discarded before ranking.
stop_word = [
    'the', 'in', 'of', 'and', 'to', 'has', 'that',
    's', 'is', 'are', 'a', 'with', 'as', 'an',
]
def getCounter(articlefileresource):
    """Count the tokens found in one text file.

    The file is read in full, lower-cased, and tokenised with a regular
    expression that matches runs of ASCII letters as well as numbers with
    an optional leading ``$`` or trailing ``%``.

    Args:
        articlefileresource: Path of the text file to analyse.

    Returns:
        A ``collections.Counter`` mapping each lower-cased token to the
        number of times it occurs in the file.
    """
    # Deliberately no trailing "$" anchor: an end-of-string anchor would
    # restrict number matches to the very end of the text.
    pattern = r'''[A-Za-z]+|\$?\d+%?'''
    with open(articlefileresource) as f:
        # Lower-case first so "The" and "the" are counted together and a
        # lower-case stop-word list can filter both spellings.
        text = f.read().lower()
    return Counter(re.findall(pattern, text))
def getRun(FILE_PATH):
    """Find the most frequent non-stop word in a directory's .txt files.

    Only files directly inside ``FILE_PATH`` whose extension is exactly
    ``.txt`` are read. The directory listing is printed first.

    Args:
        FILE_PATH: Directory containing the text files.

    Returns:
        The most frequent token that is not listed in ``stop_word``, or
        ``None`` when no such token exists.
    """
    file_names = os.listdir(FILE_PATH)
    print(file_names)
    total_counter = Counter()
    for file_name in file_names:
        if os.path.splitext(file_name)[1] == '.txt':
            # Build the full path instead of calling os.chdir(), so the
            # process-wide working directory is left untouched.
            total_counter += getCounter(os.path.join(FILE_PATH, file_name))
    # Remove stop words outright; merely zeroing them would leave them in
    # the counter, where one could be reported if nothing else was found.
    for word in stop_word:
        total_counter.pop(word, None)
    ranking = total_counter.most_common(1)
    return ranking[0][0] if ranking else None
# Run the word-frequency report only when executed as a script, not on import.
if __name__ == '__main__':
    print(getRun(FILESOURECE))
2、批量修改某個文件夾下圖片的分辨率
#coding=utf-8
#批量修改某個文件夾下圖片分辨率
import os
from PIL import Image
# Directory whose images are read.
mypath = "D://pytest/"
# Directory that receives the processed copies.
outpath = "D://pycopy/"
def processimage(filesoure, destsource, name, imgtype,
                 max_width=1180, max_height=1774):
    """Shrink one image so it fits a bounding box and save a copy.

    Args:
        filesoure: Directory that contains the source image.
        destsource: Directory the result is written to.
        name: File name of the image; used for both reading and writing.
        imgtype: Extension of the source file. ``'.jpg'`` is saved as
            JPEG, anything else as PNG.
        max_width: Largest allowed width in pixels (default 1180).
        max_height: Largest allowed height in pixels (default 1774).
    """
    imgtype = 'jpeg' if imgtype == '.jpg' else 'png'
    # os.path.join works whether or not the directory ends with a separator.
    source_file = os.path.join(filesoure, name)
    print(source_file)
    # Open the image.
    im = Image.open(source_file)
    print('----------')
    # Image.thumbnail() expects the bounding box as ONE (width, height)
    # tuple; passing width and height as two positional arguments would
    # hand the height over as the resampling filter. Per the Pillow docs
    # it preserves the aspect ratio and never enlarges, so images already
    # inside the box are saved unchanged.
    im.thumbnail((max_width, max_height))
    im.save(os.path.join(destsource, name), imgtype)
def getRun():
    """Process every .jpg/.png image found directly inside ``mypath``.

    Each matching file is handed to ``processimage`` together with
    ``mypath``, ``outpath`` and its lower-cased extension. ``outpath`` is
    created first when it does not exist yet.
    """
    if not os.path.isdir(outpath):
        os.makedirs(outpath)
    # List the source directory directly; no os.chdir() is needed because
    # processimage receives the directory explicitly.
    for file_name in os.listdir(mypath):
        # Check the extension case-insensitively so ".JPG"/".PNG" are not
        # skipped; the lower-cased form is what processimage compares.
        postfix = os.path.splitext(file_name)[1].lower()
        if postfix in ('.jpg', '.png'):
            processimage(mypath, outpath, file_name, postfix)
# Run the batch resize only when executed as a script, not on import.
if __name__ == '__main__':
    getRun()
3、統計一個目錄下所有的py文件代碼行數
#coding=utf-8
#統計一個目錄下所有py文件的代碼行數
import re,os
def anynaise_code(codefilesource):
    """Count the total, comment and blank lines of one Python file.

    A line is a comment when its first non-whitespace character is ``#``
    and blank when it contains nothing but whitespace. The three counts
    are also printed.

    Args:
        codefilesource: Path of the file to analyse.

    Returns:
        ``[total_lines, coments_lines, blank_lines]``. ``total_lines``
        counts every line, including comment and blank ones.
    """
    total_lines = 0
    coments_lines = 0
    blank_lines = 0
    with open(codefilesource) as f:
        for line in f:
            total_lines += 1
            # Strip first so indented comments, whitespace-only lines and
            # "\r\n" line endings are classified correctly.
            stripped = line.strip()
            if not stripped:
                blank_lines += 1
            elif stripped.startswith('#'):
                coments_lines += 1
    print("在%s中:" % codefilesource)
    print("代碼行數: %s" % total_lines)
    print("註釋行數: %s" % coments_lines)
    print("空格行數: %s" % blank_lines)
    return [total_lines, coments_lines, blank_lines]
def getRun(filepath):
    """Sum the line statistics of every .py file directly inside a directory.

    Each file is analysed with ``anynaise_code`` and the grand totals are
    printed.

    Args:
        filepath: Directory to scan (sub-directories are not descended into).

    Returns:
        A tuple ``(total_lines, total_commentlines, total_blanklines)``.
    """
    total_lines = 0
    total_commentlines = 0
    total_blanklines = 0
    for file_name in os.listdir(filepath):
        # Use the full path instead of os.chdir(), so the process-wide
        # working directory is left untouched.
        full_path = os.path.join(filepath, file_name)
        if os.path.splitext(file_name)[1] == '.py' and os.path.isfile(full_path):
            lines, comments, blanks = anynaise_code(full_path)
            total_lines += lines
            total_commentlines += comments
            total_blanklines += blanks
    print("總的代碼行數: %s" % total_lines)
    print("總的註釋行數: %s" % total_commentlines)
    print("總的空格行數: %s" % total_blanklines)
    return total_lines, total_commentlines, total_blanklines
# Run the line-count report only when executed as a script, not on import.
if __name__ == '__main__':
    getRun("D://pythonworkspace//")
4、使用goose進行抓取一個網頁的正文內容:
這裏首先需要安裝goose庫,使用pip install goose-extractor進行安裝
#coding=utf-8
from goose import Goose
from goose.text import StopWordsChinese
def gooseExample():
    """Extract a hard-coded China Daily URL with a default Goose instance.

    Prints the article title, then the first 150 characters of its
    cleaned text.
    """
    page_url = "http://www.chinadaily.com.cn/a/201712/22/WS5a3c7473a31008cf16da2d9e.html"
    extractor = Goose()
    result = extractor.extract(url=page_url)
    headline = result.title
    print(headline)
    excerpt = result.cleaned_text[:150]
    print(excerpt)
def gooseChineseExample():
    """Extract a hard-coded btime.com URL with Goose set to StopWordsChinese.

    Prints the article title, its meta description, then the first 150
    characters of its cleaned text.
    """
    page_url = "https://item.btime.com/36a0f17i0489keqltn35q96p4lr?from=haozcxw"
    settings = {'stopwords_class': StopWordsChinese}
    extractor = Goose(settings)
    result = extractor.extract(url=page_url)
    print(result.title)
    print(result.meta_description)
    excerpt = result.cleaned_text[:150]
    print(excerpt)
if __name__ == "__main__":
    # Run both extraction demos in order when executed as a script.
    for demo in (gooseExample, gooseChineseExample):
        demo()