import difflib
from collections import Counter

import xlrd
import xlwt
# Not referenced anywhere in the visible part of this file.
arrayNum = 6
# Module-level accumulator: read_excel() adds one dict per sheet row to it.
tables = []
# Not referenced anywhere in the visible part of this file.
newTables = []
def read_excel(path=r'test.xlsx'):
    """Return the rows of a workbook whose ``Login_IP`` value is not unique.

    Opens the first sheet of ``path`` and maps columns 0-4 of every row to
    the keys ``UID``, ``ID``, ``email``, ``IP`` and ``Login_IP`` (in that
    order). Row 0 is skipped in the result (it is treated as the header
    row); every other row is kept when its ``Login_IP`` occurs more than
    once in the sheet.

    Args:
        path: Workbook to open. Defaults to ``test.xlsx``.

    Returns:
        A list of row dicts, in sheet order. Empty when the sheet has no
        rows or no ``Login_IP`` value is repeated.

    Side effects:
        Prints the sheet name, row count and column count, and extends the
        module-level ``tables`` list with every row read (row 0 included).
    """
    workbook = xlrd.open_workbook(path)
    sheet = workbook.sheet_by_index(0)  # sheet indexes start at 0
    print(sheet.name, sheet.nrows, sheet.ncols)

    records = [
        {
            'UID': sheet.cell_value(row, 0),
            'ID': sheet.cell_value(row, 1),
            'email': sheet.cell_value(row, 2),
            'IP': sheet.cell_value(row, 3),
            'Login_IP': sheet.cell_value(row, 4),
        }
        for row in range(sheet.nrows)
    ]

    # Keep feeding the shared accumulator, but filter on the rows read by
    # THIS call only, so that calling the function twice cannot mix in rows
    # left over from an earlier call.
    tables.extend(records)

    # Occurrences are counted over every row (row 0 included); one counting
    # pass replaces a list.count() scan per row.
    login_ip_counts = Counter(record['Login_IP'] for record in records)

    return [
        record
        for record in records[1:]
        if login_ip_counts[record['Login_IP']] > 1
    ]
#print (tables[5])
#過濾相似用戶名
def likeNumber(path=r'aa.xlsx', cutoff=0.7):
    """Return the rows of a workbook whose ``ID`` resembles another ``ID``.

    Opens the first sheet of ``path`` and maps columns 0-4 of every row to
    the keys ``UID``, ``ID``, ``email``, ``IP`` and ``Login_IP`` (in that
    order); ``ID`` is converted with ``str()``. For every row except row 0,
    ``difflib.get_close_matches`` looks the row's ``ID`` up among the ``ID``
    values of all rows (row 0 included). When more than one candidate comes
    back and the best one is not the empty string, every candidate is
    printed and remembered. The result is every row (row 0 excluded) whose
    ``ID`` was remembered.

    Args:
        path: Workbook to open. Defaults to ``aa.xlsx``.
        cutoff: Similarity threshold passed to ``get_close_matches``.
            Defaults to ``0.7``.

    Returns:
        A list of row dicts, in sheet order. Empty when the sheet has fewer
        than two rows or nothing matched.
    """
    workbook = xlrd.open_workbook(path)
    sheet = workbook.sheet_by_index(0)  # sheet indexes start at 0

    records = [
        {
            'UID': sheet.cell_value(row, 0),
            'ID': str(sheet.cell_value(row, 1)),
            'email': sheet.cell_value(row, 2),
            'IP': sheet.cell_value(row, 3),
            'Login_IP': sheet.cell_value(row, 4),
        }
        for row in range(sheet.nrows)
    ]
    all_ids = [record['ID'] for record in records]

    similar_ids = set()
    for record in records[1:]:
        # At most 100 candidates are requested per lookup.
        matches = difflib.get_close_matches(
            record['ID'], all_ids, 100, cutoff=cutoff)
        # A single candidate is not enough to count as "similar"; an empty
        # best match is ignored as well.
        if len(matches) > 1 and matches[0] != "":
            for name in matches:
                print("讀取用戶名:", name)
            similar_ids.update(matches)

    return [record for record in records[1:] if record['ID'] in similar_ids]
# print (tables[5])
#過濾相似郵箱
def likeEmail(path=r'aa.xlsx', cutoff=0.4):
    """Return the rows of a workbook whose e-mail resembles another e-mail.

    Opens the first sheet of ``path`` and maps columns 0-4 of every row to
    the keys ``UID``, ``ID``, ``email``, ``IP`` and ``Login_IP`` (in that
    order); ``ID`` and ``email`` are converted with ``str()``. For every row
    except row 0, the text before the first ``@`` of the row's e-mail is
    looked up with ``difflib.get_close_matches`` among the full ``email``
    values of all rows (row 0 included). When more than one candidate comes
    back and the best one is not the empty string, every candidate is
    printed and remembered. The result is every row (row 0 excluded) whose
    ``email`` was remembered.

    Args:
        path: Workbook to open. Defaults to ``aa.xlsx``.
        cutoff: Similarity threshold passed to ``get_close_matches``.
            Defaults to ``0.4``.

    Returns:
        A list of row dicts, in sheet order. Empty when the sheet has fewer
        than two rows or nothing matched.
    """
    workbook = xlrd.open_workbook(path)
    sheet = workbook.sheet_by_index(0)  # sheet indexes start at 0

    records = [
        {
            'UID': sheet.cell_value(row, 0),
            'ID': str(sheet.cell_value(row, 1)),
            'email': str(sheet.cell_value(row, 2)),
            'IP': sheet.cell_value(row, 3),
            'Login_IP': sheet.cell_value(row, 4),
        }
        for row in range(sheet.nrows)
    ]
    all_emails = [record['email'] for record in records]

    similar_emails = set()
    for record in records[1:]:
        # Only the part before the first "@" is used as the search word; the
        # candidates it is compared against are whole e-mail strings.
        local_part = record['email'].split("@")[0]
        matches = difflib.get_close_matches(
            local_part, all_emails, 100, cutoff=cutoff)
        # A single candidate is not enough to count as "similar"; an empty
        # best match is ignored as well.
        if len(matches) > 1 and matches[0] != "":
            for address in matches:
                print("讀取郵箱:", address)
            similar_emails.update(matches)

    return [
        record for record in records[1:] if record['email'] in similar_emails
    ]
# print (tables[5])
#寫入數據
class WriteExcel:
    """Write a title, key/value lines and one table to a single xlwt sheet.

    The instance keeps a row cursor in ``self.row`` (starting at 0) that the
    ``add_*`` methods advance as they write.
    """

    def __init__(self, filename, sheet_name):
        """Create the workbook with one sheet named ``sheet_name``.

        ``filename`` is only stored here; it is used by ``save()``.
        """
        self.work_book = xlwt.Workbook(encoding="UTF-8")
        self.worksheet = self.work_book.add_sheet(sheet_name)
        self.filename = filename
        self.row = 0

    def save(self):
        """Save the workbook to ``self.filename``."""
        self.work_book.save(self.filename)

    def set_style(self, name, height, bold=False, format_str='', align='center'):
        """Build and return a new ``xlwt.XFStyle``.

        Args:
            name: Font name.
            height: Font height, assigned to ``xlwt.Font.height`` as is.
            bold: Whether the font is bold.
            format_str: Number format string of the style.
            align: ``'center'`` centres horizontally and vertically,
                ``'left'`` aligns left/bottom, any other value aligns
                right/bottom.

        Returns:
            The style, with all four borders set to ``1``.
        """
        font = xlwt.Font()
        font.name = name
        font.bold = bold
        font.height = height

        borders = xlwt.Borders()
        borders.left = 1
        borders.right = 1
        borders.top = 1
        borders.bottom = 1

        alignment = xlwt.Alignment()
        if align == 'center':
            alignment.horz = xlwt.Alignment.HORZ_CENTER
            alignment.vert = xlwt.Alignment.VERT_CENTER
        elif align == 'left':
            alignment.horz = xlwt.Alignment.HORZ_LEFT
            alignment.vert = xlwt.Alignment.VERT_BOTTOM
        else:
            alignment.horz = xlwt.Alignment.HORZ_RIGHT
            alignment.vert = xlwt.Alignment.VERT_BOTTOM

        style = xlwt.XFStyle()
        style.font = font
        style.borders = borders
        style.num_format_str = format_str
        style.alignment = alignment
        return style

    def set_title_style(self):
        """Return the style used for the title: bold '黑體', height 300."""
        return self.set_style('黑體', 300, bold=True, format_str='')

    def set_head_style(self):
        """Return the table-header style: bold Times New Roman on yellow."""
        head_style = self.set_style('Times New Roman', 220, bold=True, format_str='')
        pattern = xlwt.Pattern()
        pattern.pattern = xlwt.Pattern.SOLID_PATTERN
        pattern.pattern_fore_colour = xlwt.Style.colour_map['yellow']
        head_style.pattern = pattern
        return head_style

    def set_default_style(self):
        """Return a right-aligned, non-bold Times New Roman style."""
        return self.set_style('Times New Roman', 200, bold=False, format_str='', align='right')

    def add_title(self, title):
        """Write ``title`` merged across columns 0-2 and advance the cursor.

        The title goes to the current cursor row (row 0 on a fresh
        instance), so it no longer lands on row 0 when other rows have
        already been written.
        """
        self.worksheet.write_merge(
            self.row, self.row, 0, 2, title, self.set_title_style())
        self.row += 1

    def add_head(self, key, value):
        """Write ``key`` and ``value`` into columns 0 and 1 of the current row."""
        self.worksheet.write(self.row, 0, key)
        self.worksheet.write(self.row, 1, value)
        self.row += 1

    def add_list(self, table_head, table_detail):
        """Write a header row followed by one row per item of ``table_detail``.

        Args:
            table_head: Iterable of header labels, written left to right.
            table_detail: Iterable of dicts; the values of each dict are
                written left to right in the dict's own iteration order
                (the keys are not matched against ``table_head``).

        The cursor is advanced before every row is written, which leaves the
        row under the previous content blank; afterwards ``self.row`` is the
        index of the last row written.
        """
        self.row += 1
        # One style object is shared by all header cells instead of
        # building an identical one per cell.
        head_style = self.set_head_style()
        for col, label in enumerate(table_head):
            self.worksheet.write(self.row, col, label, head_style)
            self.worksheet.col(col).width = 150 * 30

        for record in table_detail:
            self.row += 1
            for col, value in enumerate(record.values()):
                self.worksheet.write(self.row, col, value)
if __name__ == '__main__':
    # Pass 1: read the source workbook, keep the rows that share a Login_IP
    # and write them to aa.xlsx.
    list_detail = read_excel()
    print('讀取成功')
    list_head = ["UID", "ID", "email", "IP", "Login_IP"]
    writeExcel = WriteExcel("aa.xlsx", "統計")
    writeExcel.add_title("IP統計表")
    writeExcel.add_list(list_head, list_detail)
    writeExcel.save()
    print('寫入成功')
    print("=======================")

    # Pass 2: rows with similar usernames. The result gets its own name so
    # the likeNumber function is not overwritten by its return value.
    print("第二次讀取")
    similar_names = likeNumber()
    print("用戶名讀取成功")
    writeExcel_1 = WriteExcel("用戶名.xlsx", "用戶名相似")
    writeExcel_1.add_title("用戶名相似")
    writeExcel_1.add_list(list_head, similar_names)
    writeExcel_1.save()
    print("用戶名寫入成功")
    print("=======================")

    # Pass 3: rows with similar e-mail addresses (same naming precaution).
    print("第三次讀取")
    similar_emails = likeEmail()
    print("郵箱讀取成功")
    writeExcel_1 = WriteExcel("郵箱.xlsx", "郵箱相似")
    writeExcel_1.add_title("郵箱相似")
    writeExcel_1.add_list(list_head, similar_emails)
    writeExcel_1.save()
    # This message follows the save, so it reports the write; the earlier
    # text repeated the read-success message printed above.
    print("郵箱寫入成功")
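# Example (Python 3.7): filter the duplicated rows out of an Excel sheet and
# fuzzy-match rows whose usernames or e-mail addresses are similar.
# (Residue of the web page this script was copied from: "Post a comment" /
# "All comments" / "No comments yet - be the first to post one above.")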