有五個章節的 Word 作業需要批改,問題在於每個同學填寫答案的格式都不一樣,如下:
-
大多數正常的答案(不同的同學可能位置不在圖中所示的位置)
-
不省心的同學1
-
不省心的同學2
-
有橫線的選項(情況一)
-
有橫線的選項(情況二)
-
有橫線的選項(情況三)
-
有橫線的選項(情況四)
-
有橫線的選項(情況五)
當然還包括該同學這道題根本沒有選答案,列舉的情況並不包括所有的情況,故編碼起來很繁瑣,得同時考慮到很多種情況。
解決方法
一開始我只看到了有橫線的情況,所以我的方法是:
- 方法一
用正則表達式去匹配ABCD答案,這樣的方法在有橫線的(一、三、四、五)情況中都不滿足,只能說是天真了。
# Pattern for an option letter A-D that has an underscore directly on both
# sides, e.g. "_C_"; compiled once so it can be reused for every paragraph.
regu_lin_ans_lin = r"_[A-D]_"
pt_l_a_l = re.compile(pattern=regu_lin_ans_lin, flags=0)
若繼續加上匹配 `_[A-D]` 和 `[A-D]_` 的情況,也只是能多匹配到一些情況而已。
- 方法二
既然固定的橫線格式無法涵蓋所有情況,那就放寬條件:只要某一行出現橫線,就在該行中尋找答案ABCD,所以分爲兩步:
1、先去檢測橫線;
2、然後檢測當前行是否存在ABCD(可能會存在誤差,因爲題目中也可能涉及ABCD)
# Method 2, shown as the body of the per-paragraph loop: `para`, `answers`,
# `index` and `ch2num` come from the surrounding code.
# A paragraph is treated as an answer line as soon as it contains an
# underscore; the first letter A-D on that line is taken as the answer.
regu_ans = '[A-D]'
pt_a = re.compile(regu_ans)
text = para.text
if '_' in text:
    found = pt_a.findall(text)
    if found:
        # Store the answer as a number and count it as detected.
        answers.append(ch2num(found[0]))
        index += 1
    else:
        # Underscore but no letter: record -1 and flag the document.
        # (The original used `continue` here to move on to the next paragraph.)
        answers.append(-1)  # if not detected marked as -1
        is_mark = True
- 方法三
先檢測出:第X題
然後在該題中尋找學生填寫的答案字母ABCD;這個字母應該出現在題目所給的「A.xxxxx」選項行之前。
這裏就涉及到若某一題目根本沒有做的情況。
doc文檔操作
# Demo 1: run the "_X_" pattern against a sample question sentence.
# NOTE(review): the answer blank "___C_____。" sits in the trailing comment, not
# in the string, so findall() returns [] for this text - confirm whether the
# blank was meant to be part of the string.
text = "在Excel中,下面對於自定義自動篩選說法中不正確的是"  # ___C_____。
lin_ans_lin = r"_[A-D]_"
pt = re.compile(lin_ans_lin)
lin_ans_lin = pt.findall(text)
print(lin_ans_lin)

# Demo 2: dump every paragraph of a submitted document with a 1-based number.
# The file is opened in a `with` block so the handle is closed afterwards
# (the original left it open).
with open('第五章數據管理與分析-171xxxx-xxx.docx', 'rb') as f:
    doc = Document(f)
    print(doc)
    index = 1
    for para in doc.paragraphs:
        print(str(index) + '段\t' + para.text)
        index += 1
excel文檔操作
將成績填入對應的位置
import xlrd
import os
import re
from xlutils.copy import copy
excel_file_path = 'excel/課堂測試成績.xlsx'

# step 1: open the workbook, make a writable copy and take the first sheet of
# both the original (for reading) and the copy (for writing).
data = xlrd.open_workbook(excel_file_path)
book_new = copy(data)
sheet_new = book_new.get_sheet(0)
table = data.sheets()[0]
print(table.nrows)

# step 2: map student id -> row number so scores can be written by id.
# Row 0 is skipped (presumably the header row - confirm against the sheet).
dic_stuid_row_num = {}
for i in range(1, table.nrows):
    row = table.row_values(i)
    if not row:
        continue
    stu_key = row[0]
    # xlrd returns numeric cells as float (e.g. 18120318.0), while the ids
    # parsed from the text files below are digit strings; normalise the key to
    # a digit string so the lookup can succeed.
    if isinstance(stu_key, float) and stu_key.is_integer():
        stu_key = str(int(stu_key))
    elif isinstance(stu_key, str):
        stu_key = stu_key.strip()
    dic_stuid_row_num[stu_key] = i
# print(dic_stuid_row_num)

# step 3: walk the per-chapter result files and write each score into the copy.
re_num = r"\d+"
pt_num = re.compile(re_num)
files = ['chapter2.txt', 'chapter3.txt', 'chapter4.txt', 'chapter5.txt']
sub_nums = [32, 45, 30, 30]  # number of questions in the matching file
base_dir = 'files'
for index, (file_name, sub_num) in enumerate(zip(files, sub_nums)):
    path = os.path.join(base_dir, file_name)
    with open(path, 'r', encoding='UTF-8') as f:
        for line in f:
            rs = pt_num.findall(line)
            print(rs)
            # A usable line carries at least a student id and a raw score;
            # blank lines and other log output are skipped instead of raising
            # IndexError.
            if len(rs) < 2:
                continue
            stu_id = rs[0]
            score = int(rs[1]) / sub_num
            row_num = dic_stuid_row_num.get(stu_id)
            if row_num is not None:
                # The i-th file goes to column i + 3; the score is written as
                # a percentage string with two decimals.
                sheet_new.write(row_num, index + 3, '%.2f' % (score * 100))
book_new.save('課堂測試成績.xls')
批改試卷代碼
# 計算正確的答案個數(需事先給出答案)
import os
import re
from python_docx_tutorial.ans_extractor import extract
from python_docx_tutorial.score_counter import sc_count
if __name__ == '__main__':
    # Grading configuration for each chapter: the answer key, the directory
    # holding the submissions and the number of questions.
    # Key encoding: 1-4 stand for A-D, -1 marks a question that is awarded
    # outright, -2 marks a question that earns nothing.
    chapters = {
        5: {
            'true_answers': [1, 4, 4, 2, 3, 2, 4, 3, 4, 4,
                             2, 3, 3, 4, 3, 3, 1, 1, 3, 4,
                             3, 3, 3, 2, 2, 1, 4, 3, 4, 4],
            'base_dir': 'C:\\Users\\lenovo02\\Documents\\WeChat Files\\Zipcoder\\Files\\第五章\\學生提交',
            'sub_num': 30,
        },
        4: {
            'true_answers': [2, 1, 4, 2, 4, 1, 4, 2, 1, 4,
                             1, 2, 2, 1, 3, 1, 3, 4, 4, 4,
                             1, 3, 1, 2, 3, 4, 2, 3, 3, 2],
            'base_dir': 'G:\\test\\課堂測試\\第四章\\課堂測試-學生提交',
            'sub_num': 30,
        },
        # Chapter 3: multiple-answer and fill-in questions are awarded outright
        # (-1); grading them would need too much extra logic.
        3: {
            'true_answers': [-1, 2, -1, 2, -1, -1, 2, 1, 1, 1,
                             -1, 1, 2, 3, 1, 2, -1, 1, -1, -1,
                             1, 2, 2, -1, 1, 1, 1, 1, -1, 1,
                             2, 1, 1, 2, 2, 2, 1, 1, 1, 1,
                             2, 2, 3, -1, 4],
            'base_dir': 'G:\\test\\課堂測試\\第三章\\學生提交-課堂測試',
            'sub_num': 45,
        },
        # Chapter 2: same as chapter 3; the last few questions have too many
        # options, so they earn nothing (-2).
        2: {
            'true_answers': [1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
                             1, 1, 2, 2, 2, 2, 3, 1, 4, 4,
                             -1, -1, -1, -1, -1, -2, -2, -2, -2, -2,
                             -2, -2],
            'base_dir': 'G:\\test\\課堂測試\\第二章\\提交',
            'sub_num': 32,
        },
    }
    chapter = 2  # chapter graded by this run
    true_answers = chapters[chapter]['true_answers']
    base_dir = chapters[chapter]['base_dir']
    sub_num = chapters[chapter]['sub_num']

    result = {}       # student id -> number of correct answers
    marked_file = []  # documents that need a manual check

    # The student id is rebuilt from every digit found in the file name.
    regu_stu_id = r"[0-9]"
    pt_stu_id = re.compile(regu_stu_id)

    for filename in os.listdir(base_dir):
        pathname = os.path.join(base_dir, filename)
        if os.path.isdir(pathname):
            # A submission may sit in its own sub-directory; only the first
            # entry of that directory is graded.
            inner_files = os.listdir(pathname)
            if not inner_files:
                # Empty directory: nothing to grade, leave it for manual check.
                marked_file.append(pathname)
                continue
            file_name = inner_files[0]
            file_path = os.path.join(pathname, file_name)
        else:
            file_name = filename
            file_path = pathname

        stu_id = ''.join(pt_stu_id.findall(file_name))

        # Both layouts share one guarded extraction path, so a single
        # unreadable file is reported and marked instead of aborting the run.
        try:
            with open(file_path, 'rb') as f:
                answers, marked = extract(f, sub_num)
        except Exception as e:
            print('讀取失敗:{},原因:{}'.format(file_path, e))
            marked_file.append(file_path)
            continue

        if marked:
            # Too few answers were detected: no automatic score is recorded.
            marked_file.append(file_path)
            continue
        if len(answers) <= sub_num - 10:
            print('有答案題目少於給定閾值,爲{}!'.format(len(answers)))

        # count the score
        score = sc_count(true_answers, answers)
        print('學號:{},成績:{}'.format(stu_id, score))
        result[stu_id] = score  # can be score / sub_num

    # Summary: scores sorted by student id, then the documents to re-check.
    print('有成績的人數:{}'.format(len(result)))
    print(result)
    for r in sorted(result):
        print('學號:{}, 成績:{}'.format(r, result[r]))
    print('標註文檔數目:{}'.format(len(marked_file)))
    for f in marked_file:
        print(f)
- ans_extractor.py
from docx import Document
import re
def ch2num(ch):
    """Translate an option letter into its number for later scoring.

    :param ch: one of 'A', 'B', 'C', 'D'
    :return: 1 for 'A', 2 for 'B', 3 for 'C', 4 for 'D'
    :raises KeyError: if ``ch`` is any other value
    """
    return {'A': 1, 'B': 2, 'C': 3, 'D': 4}[ch]
def extract(f, total_sub):
    """Extract one answer per question from an opened .docx homework file.

    Paragraphs are scanned in order. A paragraph containing exactly one
    ``第<number>題`` marker opens a question. From that paragraph on, the last
    letter A-D found in a paragraph is taken as the student's answer, until a
    paragraph whose stripped text starts with ``A`` (the first printed option)
    ends the search. A question for which no letter was found is recorded as
    ``-1`` so that later answers stay aligned with their questions.

    :param f: document file object opened in binary mode; ``f.name`` is printed
    :param total_sub: expected number of questions
    :return: ``[answers, needs_review]``; ``answers`` holds 1-4 for A-D and -1
        for an unanswered question, ``needs_review`` is True when fewer than
        ``total_sub - 10`` answers were detected
    """
    # step 1: compile the patterns used for every paragraph
    pt_sub = re.compile(r"第\d+題")
    pt_a = re.compile('[A-D]')

    # step 2: walk the paragraphs and collect the answers
    print('開始讀取文檔:{}'.format(f.name))
    doc = Document(f)

    answers = []
    index = 0            # number of answers actually detected
    is_find_sub = False  # True while looking for the open question's answer
    unanswered = False   # True while the open question has no entry in answers

    for para in doc.paragraphs:
        text = para.text

        # Reaching the "A..." option line ends the search for this question.
        if text.strip().startswith('A'):
            is_find_sub = False

        if len(pt_sub.findall(text)) == 1:  # a new question starts here
            # The previous question never received an answer: keep its slot
            # with -1. The original never reset its "answer found" flag, so
            # this placeholder was skipped once any answer had been found.
            if unanswered:
                answers.append(-1)
            is_find_sub = True
            unanswered = True

        if is_find_sub:
            letters = pt_a.findall(text)
            if letters:
                # The last letter on the line is taken as the answer.
                answers.append(ch2num(letters[-1]))
                is_find_sub = False
                unanswered = False
                index += 1

    # The final question may also have been left unanswered.
    if unanswered:
        answers.append(-1)

    # step 3: compare the detected count with the expected number of questions
    if index >= total_sub - 10:
        print("抽取完畢,個數爲:{}".format(index))
        return [answers, False]

    print("題目數量差太多,檢查文檔!")
    print('檢測到的題目個數爲:{}'.format(len(answers)))
    return [answers, True]
if __name__ == '__main__':
    # Manual check: run the extractor on one submitted document, expecting 30
    # questions. The file is opened in a `with` block so the handle is closed
    # afterwards (the original left it open).
    with open('C:\\Users\\lenovo02\\Documents\\WeChat Files\\Zipcoder\\Files\\第五章\\學生提交\\F110_192.168.117.110\\18120318葉宜寧.docx', 'rb') as file:
        extract(file, 30)
- score_counter.py
def sc_count(a, b):
    """Count the points earned by the detected answers ``b`` against key ``a``.

    Answers are compared position by position. Only positions present in both
    sequences are scored, so a detected list that is longer than the key no
    longer raises IndexError.

    :param a: answer key; -1 means the question is awarded outright, -2 means
        the question earns nothing, any other value must equal the answer
    :param b: detected answers; -1 means no answer was detected
    :return: number of points earned
    """
    count = 0
    for expected, given in zip(a, b):
        if expected == -1:
            # Awarded outright, whatever was detected.
            count += 1
        elif expected == -2 or given == -1:
            # Question excluded from scoring, or no answer detected.
            continue
        elif expected == given:
            count += 1
    return count
代碼地址
https://github.com/finepix/py_workspace/tree/master/python_docx_tutorial