# python3.7:過濾出 Excel 中的重複數據,並模糊匹配出用戶名和郵箱相似的數據(示例)

import xlrd
import xlwt
import difflib
# Not referenced by any other code visible in this file.
arrayNum = 6
# Module-level list that read_excel() fills with one dict per sheet row.
tables = []
# Not referenced by any other code visible in this file.
newTables = []

def read_excel():
    """Return the rows of ``test.xlsx`` whose ``Login_IP`` value is duplicated.

    Opens ``test.xlsx``, reads the first sheet and maps columns 0-4 of every
    row to the keys ``UID``, ``ID``, ``email``, ``IP`` and ``Login_IP``.
    The sheet name, row count and column count are printed.

    Side effects:
        The module-level ``tables`` list is replaced in place with all rows
        read (row 0 included), so repeated calls no longer accumulate stale
        rows from earlier calls.

    Returns:
        list[dict]: every row after row 0 (presumably the header row), in
        sheet order, whose ``Login_IP`` value occurs more than once among
        all rows of the sheet.  An empty sheet yields an empty list.
    """
    from collections import Counter

    workbook = xlrd.open_workbook(r'test.xlsx')
    sheet = workbook.sheet_by_index(0)  # sheet indexes start at 0

    # Sheet name, number of rows, number of columns.
    print(sheet.name, sheet.nrows, sheet.ncols)

    records = []
    for row_idx in range(sheet.nrows):
        records.append({
            'UID': sheet.cell_value(row_idx, 0),
            'ID': sheet.cell_value(row_idx, 1),
            'email': sheet.cell_value(row_idx, 2),
            'IP': sheet.cell_value(row_idx, 3),
            'Login_IP': sheet.cell_value(row_idx, 4),
        })

    # Replace (not extend) the shared list so a second call starts clean.
    tables[:] = records

    # Count every Login_IP once up front instead of calling list.count()
    # for each row; row 0 takes part in the counting, as before.
    ip_counts = Counter(record['Login_IP'] for record in records)

    # Row 0 is never reported, only the rows after it.
    return [
        record for record in records[1:]
        if ip_counts[record['Login_IP']] > 1
    ]
# Filter rows whose user name is similar to another user name
def likeNumber():
    """Return the rows of ``aa.xlsx`` whose ``ID`` resembles another ``ID``.

    Opens ``aa.xlsx``, reads the first sheet and maps columns 0-4 of every
    row to the keys ``UID``, ``ID``, ``email``, ``IP`` and ``Login_IP``
    (``ID`` is converted with ``str``).  For every row after row 0 the
    ``ID`` is compared against the ``ID`` values of all rows (row 0
    included) with ``difflib.get_close_matches`` (at most 100 matches,
    cutoff 0.7).  When more than one match comes back and the best match is
    not the empty string, every match is printed and remembered.

    Returns:
        list[dict]: every row after row 0 (presumably the header row), in
        sheet order, whose ``ID`` is one of the remembered matches.  An
        empty sheet yields an empty list.
    """
    workbook = xlrd.open_workbook(r'aa.xlsx')
    sheet = workbook.sheet_by_index(0)  # sheet indexes start at 0

    records = []
    for row_idx in range(sheet.nrows):
        records.append({
            'UID': sheet.cell_value(row_idx, 0),
            'ID': str(sheet.cell_value(row_idx, 1)),
            'email': sheet.cell_value(row_idx, 2),
            'IP': sheet.cell_value(row_idx, 3),
            'Login_IP': sheet.cell_value(row_idx, 4),
        })
    all_ids = [record['ID'] for record in records]

    # A set gives de-duplication and O(1) membership tests in one structure.
    similar_ids = set()
    for record in records[1:]:
        matches = difflib.get_close_matches(record['ID'], all_ids, 100, cutoff=0.7)
        # The candidate list contains the row's own ID, so a single match is
        # just the row itself; more than one means something else is close.
        if len(matches) > 1 and matches[0] != "":
            for name in matches:
                print("讀取用戶名:", name)
            similar_ids.update(matches)

    return [record for record in records[1:] if record['ID'] in similar_ids]


# Filter rows whose e-mail address is similar to another e-mail address
def likeEmail():
    """Return the rows of ``aa.xlsx`` whose ``email`` resembles another one.

    Opens ``aa.xlsx``, reads the first sheet and maps columns 0-4 of every
    row to the keys ``UID``, ``ID``, ``email``, ``IP`` and ``Login_IP``
    (``ID`` and ``email`` are converted with ``str``).  For every row after
    row 0 the part of ``email`` before the first ``@`` is compared against
    the complete ``email`` values of all rows (row 0 included) with
    ``difflib.get_close_matches`` (at most 100 matches, cutoff 0.4).  When
    more than one match comes back and the best match is not the empty
    string, every match is printed and remembered.

    Returns:
        list[dict]: every row after row 0 (presumably the header row), in
        sheet order, whose ``email`` is one of the remembered matches.  An
        empty sheet yields an empty list.
    """
    workbook = xlrd.open_workbook(r'aa.xlsx')
    sheet = workbook.sheet_by_index(0)  # sheet indexes start at 0

    records = []
    for row_idx in range(sheet.nrows):
        records.append({
            'UID': sheet.cell_value(row_idx, 0),
            'ID': str(sheet.cell_value(row_idx, 1)),
            'email': str(sheet.cell_value(row_idx, 2)),
            'IP': sheet.cell_value(row_idx, 3),
            'Login_IP': sheet.cell_value(row_idx, 4),
        })
    all_emails = [record['email'] for record in records]

    # A set gives de-duplication and O(1) membership tests in one structure.
    similar_emails = set()
    for record in records[1:]:
        local_part = record['email'].split("@")[0]
        # NOTE(review): the local part is matched against *full* addresses,
        # so the domain lowers every similarity ratio; the low 0.4 cutoff
        # looks tuned for that.  Confirm before changing either side.
        matches = difflib.get_close_matches(local_part, all_emails, 100, cutoff=0.4)
        if len(matches) > 1 and matches[0] != "":
            for address in matches:
                print("讀取郵箱:", address)
            similar_emails.update(matches)

    return [record for record in records[1:] if record['email'] in similar_emails]


# Write data to a workbook
class WriteExcel:
    """Small helper around ``xlwt`` that writes one sheet top to bottom.

    ``self.row`` is the index of the row that was written last / will be
    written next; the ``add_*`` methods advance it as they write.
    """

    def __init__(self, filename, sheet_name):
        """Create a workbook with one sheet named ``sheet_name``.

        Args:
            filename: path handed to ``Workbook.save`` by :meth:`save`.
            sheet_name: name of the single worksheet that is created.
        """
        self.work_book = xlwt.Workbook(encoding="UTF-8")
        self.worksheet = self.work_book.add_sheet(sheet_name)
        self.filename = filename
        self.row = 0

    def save(self):
        """Write the workbook to ``self.filename``."""
        self.work_book.save(self.filename)

    def set_style(self, name, height, bold=False, format_str='', align='center'):
        """Build and return an ``xlwt.XFStyle``.

        Args:
            name: font name.
            height: font height, assigned to ``Font.height`` unchanged.
            bold: whether the font is bold.
            format_str: number format string of the style.
            align: ``'center'`` centres horizontally and vertically,
                ``'left'`` aligns left/bottom, any other value aligns
                right/bottom.

        Returns:
            The style, with a border value of 1 on all four sides.
        """
        font = xlwt.Font()
        font.name = name
        font.bold = bold
        font.height = height

        borders = xlwt.Borders()
        borders.left = 1
        borders.right = 1
        borders.top = 1
        borders.bottom = 1

        alignment = xlwt.Alignment()
        if align == 'center':
            alignment.horz = xlwt.Alignment.HORZ_CENTER
            alignment.vert = xlwt.Alignment.VERT_CENTER
        elif align == 'left':
            alignment.horz = xlwt.Alignment.HORZ_LEFT
            alignment.vert = xlwt.Alignment.VERT_BOTTOM
        else:
            alignment.horz = xlwt.Alignment.HORZ_RIGHT
            alignment.vert = xlwt.Alignment.VERT_BOTTOM

        style = xlwt.XFStyle()
        style.font = font
        style.borders = borders
        style.num_format_str = format_str
        style.alignment = alignment
        return style

    def set_title_style(self):
        """Return the style used for the sheet title (bold, centred)."""
        return self.set_style('黑體', 300, bold=True, format_str='')

    def set_head_style(self):
        """Return the style used for table headers (bold, yellow fill)."""
        head_style = self.set_style('Times New Roman', 220, bold=True, format_str='')
        pattern = xlwt.Pattern()
        pattern.pattern = xlwt.Pattern.SOLID_PATTERN
        pattern.pattern_fore_colour = xlwt.Style.colour_map['yellow']
        head_style.pattern = pattern
        return head_style

    def set_default_style(self):
        """Return a right-aligned, non-bold style.

        Not used by the other methods of this class.
        """
        return self.set_style('Times New Roman', 200, bold=False, format_str='', align='right')

    def add_title(self, title):
        """Write ``title`` into row 0, merged over columns 0-2."""
        self.worksheet.write_merge(0, 0, 0, 2, title, self.set_title_style())
        self.row += 1

    def add_head(self, key, value):
        """Write a ``key`` / ``value`` pair into columns 0 and 1 of the current row."""
        self.worksheet.write(self.row, 0, key)
        self.worksheet.write(self.row, 1, value)
        self.row += 1

    def add_list(self, table_head, table_detail):
        """Write a header row followed by one row per detail record.

        Args:
            table_head: iterable of header labels, one per column.
            table_detail: iterable of mappings; the values of each mapping
                are written left to right in the mapping's own iteration
                order (``table_head`` is not used to look them up).
        """
        self.row += 1
        # Build the header style once instead of once per header cell.
        head_style = self.set_head_style()
        for col, label in enumerate(table_head):
            self.worksheet.write(self.row, col, label, head_style)
            self.worksheet.col(col).width = 150 * 30
        for rows in table_detail:
            self.row += 1
            for col, key in enumerate(rows):
                self.worksheet.write(self.row, col, rows[key])

if __name__ == '__main__':
    # Step 1: collect the rows of test.xlsx that share a Login_IP and write
    # them to aa.xlsx, which the next two steps read back.
    # NOTE(review): xlwt produces the legacy .xls format whatever the file
    # name says - confirm that the ".xlsx" names are intended.
    list_detail = read_excel()
    print ('讀取成功')
    list_head = ["UID", "ID", "email","IP","Login_IP"]
    ip_report = WriteExcel("aa.xlsx", "統計")
    ip_report.add_title("IP統計表")
    ip_report.add_list(list_head, list_detail)
    ip_report.save()
    print ('寫入成功')
    print ("=======================")

    # Step 2: rows with similar user names.  The result gets its own name
    # so the likeNumber function is not overwritten by its return value.
    print("第二次讀取")
    similar_names = likeNumber()
    print("用戶名讀取成功")

    name_report = WriteExcel("用戶名.xlsx", "用戶名相似")
    name_report.add_title("用戶名相似")
    name_report.add_list(list_head, similar_names)
    name_report.save()
    print("用戶名寫入成功")
    print ("=======================")

    # Step 3: rows with similar e-mail addresses.
    print("第三次讀取")
    similar_emails = likeEmail()
    print("郵箱讀取成功")
    email_report = WriteExcel("郵箱.xlsx", "郵箱相似")
    email_report.add_title("郵箱相似")
    email_report.add_list(list_head, similar_emails)
    email_report.save()
    # The original repeated the "read succeeded" message here; this is the
    # point where the file has been written.
    print("郵箱寫入成功")
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章