forked from Open-CT/openbrain
367 lines
17 KiB
Python
367 lines
17 KiB
Python
# ==========================================================================
|
||
# 处理过程性数据获取学生特征值 - step1
|
||
# 本脚本以采集到的【已排序的】学生作答过程性作为待处理的原始数据文档,对其进行预处理
|
||
# 预处理完成后,新的文档会在原始数据文档的基础上新增6列,作为下一步的处理对象
|
||
#
|
||
# 新增信息:
|
||
# page:学生可能会连续在同一页面上执行一系列动作,当结束在同一页面上的操作并切换到另
|
||
# 一页面后,会在切换后的第一个页面的动作行的page列标注刚刚离开的页面页数,
|
||
# 如果停留时间(见pagetime)小于1秒则不标注
|
||
# pagetime:同上,会在pagetime列标注在刚刚离开的页面上的停留时长,精确到1秒,如果
|
||
# 停留时间小于1秒则不标注
|
||
# edit:如果学生执行“编辑”操作则标注为1,否则为0;“编辑”定义为修改选择题的答案或
|
||
# 填空题的一个连续删除序列的结束(连续删除序列后增加字符、或者切换到下一题)
|
||
# before:学生从拿到一个新页面到在页面上首次作答之间的时间差,在每页的第一次修改行
|
||
# 标注该行和上一行的时间差,精确到0.1秒
|
||
# after:学生完成一个页面的作答到切换到下一页之间的时间差,在每页的第一次访问行标注
|
||
# 该行和上一行的时间差,精确到0.1秒
|
||
# AR:学生完成一页题目后,已经进入了下一页面、又返回修改已完成的页面。如果该编辑动作
|
||
# 属于这种情况,则标注为1,否则为0
|
||
# ==========================================================================
|
||
# Attention!!!
|
||
# 由于判断某行是否为一个修改行,需要将它和前后行进行对比,即参考3行-->计算得出中间1行
|
||
# 因此在遍历每行时,计算的均为上一行的edit、before、after、AR值,最终结果需将这四
|
||
# 个列表向前循环一位
|
||
# ==========================================================================
|
||
from operator import le
|
||
from tracemalloc import stop
|
||
import pandas as pd
|
||
from datetime import date, datetime
|
||
import json
|
||
|
||
def caltime(date1, date2):
|
||
""" 计算两个datatime类型时间点之间的时间长度,长度以秒为单位
|
||
:param date1: 开始时间
|
||
:param date2: 结束时间
|
||
|
||
"""
|
||
if (date1 == '' or date2 == ''):
|
||
return ""
|
||
datetimeFormat = '%Y-%m-%dT%H:%M:%S.%f+08:00' # 可能存在的两种时间格式,均需要进行处理
|
||
datetimeFormat2 = '%Y-%m-%dT%H:%M:%S+08:00'
|
||
try:
|
||
d1 = datetime.strptime(str(date1), datetimeFormat)
|
||
except ValueError:
|
||
d1 = datetime.strptime(str(date1), datetimeFormat2)
|
||
try:
|
||
d2 = datetime.strptime(str(date2), datetimeFormat)
|
||
except ValueError:
|
||
d2 = datetime.strptime(str(date2), datetimeFormat2)
|
||
delta = d2 - d1 # 两个datetime类相减,得到一个deltatime类的对象
|
||
miao = delta.seconds
|
||
if (miao == 0):
|
||
miao = ""
|
||
return miao
|
||
|
||
data = pd.read_excel(r'A_demo.xlsx') # 输入文档,要求是完整的已排序的过程性数据
|
||
|
||
# 以下为定义的全局变量
|
||
old_id = "" # 上一行的id
|
||
old_task_name = "" # 上一行的任务名称
|
||
oldold_answer_dict = {} # 上上一行、其打开页面上的作答结果
|
||
old_answer_dict = {} # 上一行、其打开页面上的作答结果
|
||
oldold_page = 0 # 上上一行、其打开的页面页数
|
||
old_page = 0 # 上一行、其打开的页面页数
|
||
oldold_time = "" # 上上一行、其动作的时间戳
|
||
old_time = "" # 上一行、其动作的时间戳
|
||
starttime = "" # 这两个time值用于保存计算页面访问时间的值
|
||
stoptime = ""
|
||
max_page = 0 # 由于计算AR列需要判断当前行是否为一个返回已完成页面的动作,即当前行的页面号是否小于已访问过的最大序号
|
||
before_tag = [] # 由于计算before列需要获得每页的第一次修改行,因此用该变量标注当前学生已修改过的页面页数
|
||
|
||
# 以下为最终写入新表格的新列数据
|
||
page_list = []
|
||
page_time_list = []
|
||
edit_list = []
|
||
AR_list = []
|
||
before_list = []
|
||
after_list = []
|
||
|
||
|
||
|
||
for index, row in data.iterrows():
|
||
# 开始按行遍历表格
|
||
print(index)
|
||
# 提取每行的重要信息
|
||
id = str(row['ticket_id'])
|
||
task_id = str(row["task_id"])
|
||
task_name = str(row["task_name"])
|
||
timestamp = str(row["timestamp"])
|
||
answer = json.loads(row['task_answer'])
|
||
frame = answer["frame"]
|
||
answer_dict = {}
|
||
if (frame != None):
|
||
# 若该行中能提取出作答信息,则提取作答结果数据,反之该行可能是用于分隔不同学生的null列
|
||
dataa = frame["data"]
|
||
page = int(dataa["page"])
|
||
# 以下代码段用于更新计算每页停留时间的starttime变量和stoptime变量,并写入page和pagetime列
|
||
if ((old_id != id) or (old_task_name != task_name)):
|
||
print("###")
|
||
starttime = timestamp
|
||
page_list.append("")
|
||
page_time_list.append("")
|
||
elif (page != 1 and old_page != page):
|
||
print("&&&")
|
||
stoptime = timestamp
|
||
delta = caltime(starttime, stoptime)
|
||
starttime = timestamp
|
||
page_list.append(old_page)
|
||
page_time_list.append(delta)
|
||
else:
|
||
print("@@@")
|
||
page_list.append("")
|
||
page_time_list.append("")
|
||
# 以下代码段处理具体的学生作答结果,写入edit、before、AR列
|
||
answer = dataa["answer"]
|
||
answer_list = list(answer)
|
||
### 以下代码段从学生的全部作答中,提取当前操作页面的作答结果,保存入全局变量answer_dict
|
||
if (task_name == "热身题【本题不计入总分】"):
|
||
# 热身题
|
||
pass
|
||
elif (task_name == "运动会问题"):
|
||
# 运动会问题
|
||
if (page == 1):
|
||
answer_dict['P3'] = answer_list[0]
|
||
elif (page == 2):
|
||
answer_dict['MM60311'] = answer_list[1]
|
||
elif (page == 3):
|
||
answer_dict['MM60321'] = answer_list[2]
|
||
elif (page == 4):
|
||
answer_dict['MM60331'] = answer_list[3]
|
||
elif (page == 5):
|
||
answer_dict['MM60341_wang'] = answer_list[4]
|
||
answer_dict['MM60341_ming'] = answer_list[5]
|
||
answer_dict['MM60341_zhang'] = answer_list[6]
|
||
answer_dict['MM60341_li'] = answer_list[7]
|
||
answer_dict['MM60341_hua'] = answer_list[8]
|
||
elif (page == 6):
|
||
answer_dict['MM60351'] = answer_list[9]
|
||
elif (task_name == "生活水平问题"):
|
||
# 生活水平问题
|
||
if (page == 1):
|
||
answer_dict['P4'] = answer_list[0]
|
||
elif (page == 2):
|
||
answer_dict['MM60411'] = answer_list[1]
|
||
elif (page == 3):
|
||
answer_dict['MM60421'] = answer_list[2]
|
||
elif (page == 4):
|
||
answer_dict['MM60431'] = answer_list[3]
|
||
answer_dict['MM60432'] = answer_list[4]
|
||
elif (page == 5):
|
||
answer_dict['MM60441'] = answer_list[5]
|
||
answer_dict['MM60442'] = answer_list[6]
|
||
elif (page == 6):
|
||
answer_dict['MM60451_1'] = answer_list[7]
|
||
answer_dict['MM60451_2'] = answer_list[8]
|
||
elif (page == 7):
|
||
answer_dict['MM60461_1'] = answer_list[9]
|
||
answer_dict['MM60461_2'] = answer_list[10]
|
||
answer_dict['MM60461_3'] = answer_list[11]
|
||
answer_dict['MM60461_4'] = answer_list[12]
|
||
# 以下处理判断【当前处理行的【上一行】】是否为修改动作,是否为edit,判断逻辑为:
|
||
# ========================================================================
|
||
# 从当前的page要向前看两页old_page和oldold_page,判断old_page页为修改页的条件:
|
||
# old_page相对oldold,页数不变
|
||
# old_page为选择题页
|
||
# 要求出现的-1数目没有改变
|
||
# old_page为填空题页
|
||
# old_page相对oldold,字符串长度缩短且
|
||
# page相对old_page,页数不变且字符串长度增加
|
||
# page相对old_page,页数改变
|
||
# ========================================================================
|
||
# NOTES:由于具体每页的题目数量和类型均不同,因此难以整合出统一的处理方法(函数),
|
||
# 目前直接对每题、每页进行单独判断并处理,后期可优化
|
||
if (old_page != oldold_page):
|
||
edit_list.append("0")
|
||
before_list.append("")
|
||
if (max_page < old_page):
|
||
time_tmp = caltime(oldold_time, old_time)
|
||
if (time_tmp == ""):
|
||
time_tmp = 0
|
||
after_list.append(time_tmp)
|
||
else:
|
||
after_list.append("")
|
||
else:
|
||
print(before_tag)
|
||
after_list.append("")
|
||
if (old_page not in before_tag):
|
||
before_tag.append(old_page)
|
||
time_tmp = caltime(oldold_time, old_time)
|
||
if (time_tmp == ""):
|
||
time_tmp = 0
|
||
before_list.append(time_tmp)
|
||
else:
|
||
before_list.append("")
|
||
if (old_task_name == "运动会问题"):
|
||
if (old_page in [1, 2, 3, 6]):
|
||
old_answer = str(list(old_answer_dict.values()))
|
||
oldold_answer = str(list(oldold_answer_dict.values()))
|
||
if (old_answer.count("-1") == oldold_answer.count("-1")):
|
||
edit_list.append("1")
|
||
else:
|
||
edit_list.append("0")
|
||
elif (old_page == 4):
|
||
if (len(old_answer_dict["MM60331"]) < len(oldold_answer_dict["MM60331"])):
|
||
if (page != old_page):
|
||
edit_list.append("1")
|
||
elif (page == old_page and len(old_answer_dict["MM60331"]) < len(answer_dict["MM60331"])):
|
||
edit_list.append("1")
|
||
else:
|
||
edit_list.append("0")
|
||
else:
|
||
edit_list.append("0")
|
||
elif (old_page == 5):
|
||
old_answer = old_answer_dict["MM60341_wang"] + old_answer_dict["MM60341_ming"] + old_answer_dict["MM60341_zhang"] + old_answer_dict["MM60341_li"] + old_answer_dict["MM60341_hua"]
|
||
oldold_answer = oldold_answer_dict["MM60341_wang"] + oldold_answer_dict["MM60341_ming"] + oldold_answer_dict["MM60341_zhang"] + oldold_answer_dict["MM60341_li"] + oldold_answer_dict["MM60341_hua"]
|
||
if (len(old_answer) < len(oldold_answer)):
|
||
if (page != old_page):
|
||
edit_list.append("1")
|
||
elif (page == old_page):
|
||
now_answer = answer_dict["MM60341_wang"] + answer_dict["MM60341_ming"] + answer_dict["MM60341_zhang"] + answer_dict["MM60341_li"] + answer_dict["MM60341_hua"]
|
||
if (len(old_answer) < len(now_answer)):
|
||
edit_list.append("1")
|
||
else:
|
||
edit_list.append("0")
|
||
else:
|
||
edit_list.append("0")
|
||
else:
|
||
edit_list.append("0")
|
||
else:
|
||
edit_list.append("0")
|
||
elif (old_task_name == "生活水平问题"):
|
||
if (old_page in [1, 2, 3, 7]):
|
||
old_answer = str(list(old_answer_dict.values()))
|
||
oldold_answer = str(list(oldold_answer_dict.values()))
|
||
if (old_answer.count("-1") == oldold_answer.count("-1")):
|
||
edit_list.append("1")
|
||
else:
|
||
edit_list.append("0")
|
||
elif (old_page == 4):
|
||
old_answer = old_answer_dict["MM60431"] + old_answer_dict["MM60432"]
|
||
oldold_answer = oldold_answer_dict["MM60431"] + oldold_answer_dict["MM60432"]
|
||
if (len(old_answer) < len(oldold_answer)):
|
||
if (page != old_page):
|
||
edit_list.append("1")
|
||
elif (page == old_page):
|
||
now_answer = answer_dict["MM60431"] + answer_dict["MM60432"]
|
||
if (len(old_answer) < len(now_answer)):
|
||
edit_list.append("1")
|
||
else:
|
||
edit_list.append("0")
|
||
else:
|
||
edit_list.append("0")
|
||
else:
|
||
edit_list.append("0")
|
||
elif (old_page == 5):
|
||
old_answer = old_answer_dict["MM60441"] + old_answer_dict["MM60442"]
|
||
oldold_answer = oldold_answer_dict["MM60441"] + oldold_answer_dict["MM60442"]
|
||
if (len(old_answer) < len(oldold_answer)):
|
||
if (page != old_page):
|
||
edit_list.append("1")
|
||
elif (page == old_page):
|
||
now_answer = answer_dict["MM60441"] + answer_dict["MM60442"]
|
||
if (len(old_answer) < len(now_answer)):
|
||
edit_list.append("1")
|
||
else:
|
||
edit_list.append("0")
|
||
else:
|
||
edit_list.append("0")
|
||
else:
|
||
edit_list.append("0")
|
||
elif (old_page == 6):
|
||
old_answer = old_answer_dict["MM60451_1"] + old_answer_dict["MM60451_2"]
|
||
oldold_answer = oldold_answer_dict["MM60451_1"] + oldold_answer_dict["MM60451_2"]
|
||
if (len(old_answer) < len(oldold_answer)):
|
||
if (page != old_page):
|
||
edit_list.append("1")
|
||
elif (page == old_page):
|
||
now_answer = answer_dict["MM60451_1"] + answer_dict["MM60451_2"]
|
||
if (len(old_answer) < len(now_answer)):
|
||
edit_list.append("1")
|
||
else:
|
||
edit_list.append("0")
|
||
else:
|
||
edit_list.append("0")
|
||
else:
|
||
edit_list.append("0")
|
||
else:
|
||
edit_list.append("0")
|
||
else:
|
||
edit_list.append("0")
|
||
edit = int(edit_list[-1])
|
||
# 以下代码:写入AR列,满足AR列的前提条件是一个edit列
|
||
# 满足AR列的条件:(edit==1) && (该行的页面序号小于最大已访问页面序号)
|
||
if (max_page < old_page):
|
||
max_page = old_page
|
||
if (edit == 1 and old_page < max_page):
|
||
AR_list.append("1")
|
||
else:
|
||
AR_list.append("0")
|
||
# 以下代码:常规动作,更新全局变量
|
||
oldold_page = old_page
|
||
old_page = page
|
||
oldold_time = old_time
|
||
old_time = timestamp
|
||
else:
|
||
# 若该行是无内容null页,用于分隔不同的学生作答
|
||
# 以下代码:此时可以重新初始化各个每个学生特有的变量
|
||
page = 0
|
||
before_tag = []
|
||
max_page = 0
|
||
# 以下代码:如果改变了用户id或task_name,则当前时间设为starttime,即第一道题的开始时间
|
||
if ((old_id != id) or (old_task_name != task_name)):
|
||
print("###")
|
||
starttime = timestamp
|
||
# 以下代码:向写入新列中填充合适的空值
|
||
page_list.append("")
|
||
page_time_list.append("")
|
||
edit_list.append("0")
|
||
AR_list.append("0")
|
||
before_list.append("")
|
||
# 以下代码:但针对after列,切换了用户则必然代表切换到新页,因此向after列中填充合适的空值
|
||
time_tmp = caltime(oldold_time, old_time)
|
||
if (time_tmp == ""):
|
||
time_tmp = 0
|
||
after_list.append(time_tmp)
|
||
# 以下代码:常规动作,更新全局变量
|
||
oldold_time = old_time
|
||
old_time = timestamp
|
||
oldold_page = old_page
|
||
old_page = page
|
||
# 以下代码:常规动作,更新全局变量
|
||
old_id = id
|
||
old_task_name = task_name
|
||
oldold_answer_dict = old_answer_dict
|
||
old_answer_dict = answer_dict
|
||
|
||
# 以下代码:将四个列表向前循环一位
|
||
x = edit_list.pop(0)
|
||
edit_list.append(x)
|
||
|
||
x = AR_list.pop(0)
|
||
AR_list.append(x)
|
||
|
||
x = before_list.pop(0)
|
||
before_list.append(x)
|
||
|
||
x = after_list.pop(0)
|
||
after_list.append(x)
|
||
|
||
# 以下代码:将6个新列表写入原始pandas数据
|
||
col_name = data.columns.tolist()
|
||
|
||
col_name.insert(col_name.index('task_answer')+1, 'AR')
|
||
col_name.insert(col_name.index('task_answer')+1, 'after')
|
||
col_name.insert(col_name.index('task_answer')+1, 'before')
|
||
col_name.insert(col_name.index('task_answer')+1, 'edit')
|
||
col_name.insert(col_name.index('task_answer')+1, 'pagetime')
|
||
col_name.insert(col_name.index('task_answer')+1, 'page')
|
||
data = data.reindex(columns=col_name)
|
||
data['AR'] = AR_list
|
||
data['after'] = after_list
|
||
data['before'] = before_list
|
||
data['edit'] = edit_list
|
||
data['pagetime'] = page_time_list
|
||
data['page'] = page_list
|
||
|
||
data.to_excel('A_demo_out.xlsx') # 数据导出 |