論文:
題目:Personal Information in Passwords and Its Security Implications
鏈接:https://ieeexplore.ieee.org/abstract/document/7931642
實驗重現:
# encoding: utf-8
# author: kaiyouhu
import pandas as pd
import numpy as np
import re
class Passenger:
    """Plain container for one account record.

    The constructor stores each argument unchanged on the attribute of the
    same name; the class has no other behaviour.
    """

    def __init__(self, login_email, password, name, id_number, username, phone, email):
        self.login_email = login_email
        self.password = password
        self.name = name
        self.id_number = id_number
        self.username = username
        self.phone = phone
        self.email = email
def _share(count, total):
    """Return ``count / total`` as a float, or 0.0 when ``total`` is zero."""
    return float(count / total) if total else 0.0


def _contains(haystack, needle):
    """Return True when the non-empty string ``needle`` occurs in ``haystack``.

    ``str.find('')`` is always 0, so without the emptiness check an empty
    field would be counted as a match for every password.
    """
    return bool(needle) and needle in haystack


def _print_table(labels, amounts, total):
    """Print one RANK/Password/Amount/Percentage table for ``labels``."""
    header = ['RANK', 'Password', 'Amount', 'Percentage']
    ranks = range(1, len(labels) + 1)
    shares = [_share(amount, total) for amount in amounts]
    # np.array coerces the mixed int/str/float rows to strings, which is why
    # the percentages are printed with their full float representation.
    rows = np.array(list(zip(ranks, labels, amounts, shares)), order='C')
    print(pd.DataFrame(rows, columns=header))


def read_data(path, encoding=None):
    """Read an account dump and print three password statistics tables.

    Every non-blank line of the file is split on ``'----'``. The code uses the
    fields by position: 0 login e-mail, 1 password, 2 name, 3 ID number,
    4 username, 5 phone (the same order as the ``Passenger`` constructor
    arguments; presumably that is the layout of the dump -- confirm against
    the data file).

    Printed, in order:
      1. the record count (``sum: N``),
      2. how often each of ten fixed passwords occurs,
      3. how many passwords match each of ten fixed structures
         (``D7`` = seven digits, ``L2D7`` = two letters then seven digits, ...),
      4. how many passwords are related to each kind of personal information.

    Args:
        path: Path of the text file to analyse.
        encoding: Text encoding passed to ``open``; ``None`` keeps the
            platform default.

    Returns:
        None. All results are printed.

    Raises:
        IndexError: If a non-blank line has fewer than six fields.
    """
    with open(path, encoding=encoding) as f:
        # Blank lines carry no record and would fail on the field lookups.
        records = [line.split('----') for line in f.read().splitlines() if line.strip()]
    total = len(records)
    print('sum: ' + str(total))

    passwords = [record[1] for record in records]

    # Table 1: occurrences of ten fixed, very common passwords.
    top_passwords = ['123456', 'a123456', '123456a', '5201314', '111111',
                     'woaini1314', 'qq123456', '123123', '000000', '1qaz2wsx']
    _print_table(top_passwords, [passwords.count(candidate) for candidate in top_passwords], total)

    # Table 2: passwords matching ten fixed letter/digit structures.
    structures = [
        ('D7', r'^\d{7}$'),
        ('D8', r'^\d{8}$'),
        ('D6', r'^\d{6}$'),
        ('L2D7', r'^[a-zA-Z]{2}\d{7}$'),
        ('L3D6', r'^[a-zA-Z]{3}\d{6}$'),
        ('L1D7', r'^[a-zA-Z]{1}\d{7}$'),
        ('L2D6', r'^[a-zA-Z]{2}\d{6}$'),
        ('L3D7', r'^[a-zA-Z]{3}\d{7}$'),
        ('D9', r'^\d{9}$'),
        ('L2D8', r'^[a-zA-Z]{2}\d{8}$'),
    ]
    structure_amounts = []
    for _, regex in structures:
        pattern = re.compile(regex)
        structure_amounts.append(sum(1 for password in passwords if pattern.match(password)))
    _print_table([label for label, _ in structures], structure_amounts, total)

    # Table 3: passwords related to a piece of personal information.
    info_checks = [
        # Characters 6..13 of the ID number are used as the birthdate.
        ('Birthdate', lambda r: _contains(r[1], r[3][6:14])),
        ('AccountName', lambda r: _contains(r[1], r[4])),
        # Field 2 is the name; field 4 is the username, which the AccountName
        # row already covers. The comparison is literal: a name written in
        # non-Latin characters is not transliterated before matching.
        ('Name', lambda r: _contains(r[1], r[2])),
        ('Email', lambda r: _contains(r[1], r[0].split('@')[0])),
        # Reversed direction: the password must be a fragment of the ID number
        # (field 3).
        ('IDNumber', lambda r: _contains(r[3], r[1])),
        ('CellPhone', lambda r: _contains(r[1], r[5])),
    ]
    info_amounts = [sum(1 for record in records if check(record)) for _, check in info_checks]
    _print_table([label for label, _ in info_checks], info_amounts, total)
if __name__ == '__main__':
    # Run the analysis only when executed as a script, so that importing this
    # module does not try to read the data file.
    read_data('../data/12306.txt')
輸出結果:
sum: 131653
RANK Password Amount Percentage
0 1 123456 392 0.0029775242493524645
1 2 a123456 281 0.0021343987603776593
2 3 123456a 165 0.0012532946457733587
3 4 5201314 161 0.0012229117452697623
4 5 111111 157 0.0011925288447661656
5 6 woaini1314 136 0.0010330186171222835
6 7 qq123456 98 0.0007443810623381161
7 8 123123 98 0.0007443810623381161
8 9 000000 97 0.0007367853372122169
9 10 1qaz2wsx 93 0.0007064024367086204
RANK Password Amount Percentage
0 1 D7 10906 0.08283897822305607
1 2 D8 9458 0.0718403682407541
2 3 D6 9102 0.06913629009593401
3 4 L2D7 5073 0.038533113563686355
4 5 L3D6 4832 0.036702543808344666
5 6 L1D7 4778 0.03629237465154611
6 7 L2D6 4275 0.03247172491321884
7 8 L3D7 3885 0.029509392114118176
8 9 D9 3594 0.027299036102481522
9 10 L2D8 3371 0.025605189399406016
RANK Password Amount Percentage
0 1 Birthdate 5726 0.0434931220708985
1 2 AccountName 2565 0.019483034947931303
2 3 Name 2565 0.019483034947931303
3 4 Email 3979 0.030223390275952694
4 5 IDNumber 6835 0.05191678123552065
5 6 CellPhone 89 0.0006760195362050238
Process finished with exit code 0
備註:
1實驗數據自己去百度網盤找下載的,大概14M的txt,共131653條數據(有的版本可能會上下差一些條數,但基本上差不多)
2只重現到第三個表格:前面兩個表格的數據與論文基本一致;第三個表格我不太理解,也不知道作者具體是怎麼實現的(匹配細節不清楚);後面的表格我懶得去編寫輸出了
substring← get_all_substring(pwd)
reverse_length_sort(substring)
for eachstring ∈ substring do
if len(eachstring) ≥ 2 then
if matchbd(eachstring,infolist) then
按照我對作者這段代碼的理解,思路是:獲取密碼中所有長度大於等於2的子串,然後拿去和身份證信息、電話號碼等逐一匹配,我就呵呵了
3論文的內容看看就好了,個人感覺過程有點水,結論得出得太草率(本人愚鈍之見,不要太在意)