openbrain/pydata-han/Process/preprocess_processdata_A.py

367 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# ==========================================================================
# 处理过程性数据获取学生特征值 - step1
# 本脚本以采集到的【已排序的】学生作答过程性作为待处理的原始数据文档,对其进行预处理
# 预处理完成后新的文档会在原始数据文档的基础上新增6列作为下一步的处理对象
#
# 新增信息:
# page学生可能会连续在同一页面上执行一系列动作当结束在同一页面上的操作并切换到另
# 一页面后会在切换后的第一个页面的动作行的page列标注刚刚离开的页面页数
# 如果停留时间见pagetime小于1秒则不标注
# pagetime同上会在pagetime列标注在刚刚离开的页面上的停留时长精确到1秒如果
# 停留时间小于1秒则不标注
# edit如果学生执行“编辑”操作则标注为1否则为0“编辑”定义为修改选择题的答案或
# 填空题的一个连续删除序列的结束(连续删除序列后增加字符、或者切换到下一题)
# before学生从拿到一个新页面到在页面上首次作答之间的时间差在每页的第一次修改行
# 标注该行和上一行的时间差精确到0.1秒
# after学生完成一个页面的作答到切换到下一页之间的时间差在每页的第一次访问行标注
# 该行和上一行的时间差精确到0.1秒
# AR学生完成一页题目后已经进入了下一页面、又返回修改已完成的页面。如果该编辑动作
# 属于这种情况则标注为1否则为0
# ==========================================================================
# Attention!!!
# 由于判断某行是否为一个修改行需要将它和前后行进行对比即参考3行-->计算得出中间1行
# 因此在遍历每行时计算的均为上一行的edit、before、after、AR值最终结果需将这四
# 个列表向前循环一位
# ==========================================================================
from operator import le
from tracemalloc import stop
import pandas as pd
from datetime import date, datetime
import json
def caltime(date1, date2):
""" 计算两个datatime类型时间点之间的时间长度长度以秒为单位
:param date1: 开始时间
:param date2: 结束时间
"""
if (date1 == '' or date2 == ''):
return ""
datetimeFormat = '%Y-%m-%dT%H:%M:%S.%f+08:00' # 可能存在的两种时间格式,均需要进行处理
datetimeFormat2 = '%Y-%m-%dT%H:%M:%S+08:00'
try:
d1 = datetime.strptime(str(date1), datetimeFormat)
except ValueError:
d1 = datetime.strptime(str(date1), datetimeFormat2)
try:
d2 = datetime.strptime(str(date2), datetimeFormat)
except ValueError:
d2 = datetime.strptime(str(date2), datetimeFormat2)
delta = d2 - d1 # 两个datetime类相减得到一个deltatime类的对象
miao = delta.seconds
if (miao == 0):
miao = ""
return miao
data = pd.read_excel(r'A_demo.xlsx') # 输入文档,要求是完整的已排序的过程性数据
# 以下为定义的全局变量
old_id = "" # 上一行的id
old_task_name = "" # 上一行的任务名称
oldold_answer_dict = {} # 上上一行、其打开页面上的作答结果
old_answer_dict = {} # 上一行、其打开页面上的作答结果
oldold_page = 0 # 上上一行、其打开的页面页数
old_page = 0 # 上一行、其打开的页面页数
oldold_time = "" # 上上一行、其动作的时间戳
old_time = "" # 上一行、其动作的时间戳
starttime = "" # 这两个time值用于保存计算页面访问时间的值
stoptime = ""
max_page = 0 # 由于计算AR列需要判断当前行是否为一个返回已完成页面的动作即当前行的页面号是否小于已访问过的最大序号
before_tag = [] # 由于计算before列需要获得每页的第一次修改行因此用该变量标注当前学生已修改过的页面页数
# 以下为最终写入新表格的新列数据
page_list = []
page_time_list = []
edit_list = []
AR_list = []
before_list = []
after_list = []
for index, row in data.iterrows():
# 开始按行遍历表格
print(index)
# 提取每行的重要信息
id = str(row['ticket_id'])
task_id = str(row["task_id"])
task_name = str(row["task_name"])
timestamp = str(row["timestamp"])
answer = json.loads(row['task_answer'])
frame = answer["frame"]
answer_dict = {}
if (frame != None):
# 若该行中能提取出作答信息则提取作答结果数据反之该行可能是用于分隔不同学生的null列
dataa = frame["data"]
page = int(dataa["page"])
# 以下代码段用于更新计算每页停留时间的starttime变量和stoptime变量并写入page和pagetime列
if ((old_id != id) or (old_task_name != task_name)):
print("###")
starttime = timestamp
page_list.append("")
page_time_list.append("")
elif (page != 1 and old_page != page):
print("&&&")
stoptime = timestamp
delta = caltime(starttime, stoptime)
starttime = timestamp
page_list.append(old_page)
page_time_list.append(delta)
else:
print("@@@")
page_list.append("")
page_time_list.append("")
# 以下代码段处理具体的学生作答结果写入edit、before、AR列
answer = dataa["answer"]
answer_list = list(answer)
### 以下代码段从学生的全部作答中提取当前操作页面的作答结果保存入全局变量answer_dict
if (task_name == "热身题【本题不计入总分】"):
# 热身题
pass
elif (task_name == "运动会问题"):
# 运动会问题
if (page == 1):
answer_dict['P3'] = answer_list[0]
elif (page == 2):
answer_dict['MM60311'] = answer_list[1]
elif (page == 3):
answer_dict['MM60321'] = answer_list[2]
elif (page == 4):
answer_dict['MM60331'] = answer_list[3]
elif (page == 5):
answer_dict['MM60341_wang'] = answer_list[4]
answer_dict['MM60341_ming'] = answer_list[5]
answer_dict['MM60341_zhang'] = answer_list[6]
answer_dict['MM60341_li'] = answer_list[7]
answer_dict['MM60341_hua'] = answer_list[8]
elif (page == 6):
answer_dict['MM60351'] = answer_list[9]
elif (task_name == "生活水平问题"):
# 生活水平问题
if (page == 1):
answer_dict['P4'] = answer_list[0]
elif (page == 2):
answer_dict['MM60411'] = answer_list[1]
elif (page == 3):
answer_dict['MM60421'] = answer_list[2]
elif (page == 4):
answer_dict['MM60431'] = answer_list[3]
answer_dict['MM60432'] = answer_list[4]
elif (page == 5):
answer_dict['MM60441'] = answer_list[5]
answer_dict['MM60442'] = answer_list[6]
elif (page == 6):
answer_dict['MM60451_1'] = answer_list[7]
answer_dict['MM60451_2'] = answer_list[8]
elif (page == 7):
answer_dict['MM60461_1'] = answer_list[9]
answer_dict['MM60461_2'] = answer_list[10]
answer_dict['MM60461_3'] = answer_list[11]
answer_dict['MM60461_4'] = answer_list[12]
# 以下处理判断【当前处理行的【上一行】】是否为修改动作是否为edit判断逻辑为
# ========================================================================
# 从当前的page要向前看两页old_page和oldold_page判断old_page页为修改页的条件
# old_page相对oldold页数不变
# old_page为选择题页
# 要求出现的-1数目没有改变
# old_page为填空题页
# old_page相对oldold字符串长度缩短且
# page相对old_page页数不变且字符串长度增加
# page相对old_page页数改变
# ========================================================================
# NOTES由于具体每页的题目数量和类型均不同因此难以整合出统一的处理方法函数
# 目前直接对每题、每页进行单独判断并处理,后期可优化
if (old_page != oldold_page):
edit_list.append("0")
before_list.append("")
if (max_page < old_page):
time_tmp = caltime(oldold_time, old_time)
if (time_tmp == ""):
time_tmp = 0
after_list.append(time_tmp)
else:
after_list.append("")
else:
print(before_tag)
after_list.append("")
if (old_page not in before_tag):
before_tag.append(old_page)
time_tmp = caltime(oldold_time, old_time)
if (time_tmp == ""):
time_tmp = 0
before_list.append(time_tmp)
else:
before_list.append("")
if (old_task_name == "运动会问题"):
if (old_page in [1, 2, 3, 6]):
old_answer = str(list(old_answer_dict.values()))
oldold_answer = str(list(oldold_answer_dict.values()))
if (old_answer.count("-1") == oldold_answer.count("-1")):
edit_list.append("1")
else:
edit_list.append("0")
elif (old_page == 4):
if (len(old_answer_dict["MM60331"]) < len(oldold_answer_dict["MM60331"])):
if (page != old_page):
edit_list.append("1")
elif (page == old_page and len(old_answer_dict["MM60331"]) < len(answer_dict["MM60331"])):
edit_list.append("1")
else:
edit_list.append("0")
else:
edit_list.append("0")
elif (old_page == 5):
old_answer = old_answer_dict["MM60341_wang"] + old_answer_dict["MM60341_ming"] + old_answer_dict["MM60341_zhang"] + old_answer_dict["MM60341_li"] + old_answer_dict["MM60341_hua"]
oldold_answer = oldold_answer_dict["MM60341_wang"] + oldold_answer_dict["MM60341_ming"] + oldold_answer_dict["MM60341_zhang"] + oldold_answer_dict["MM60341_li"] + oldold_answer_dict["MM60341_hua"]
if (len(old_answer) < len(oldold_answer)):
if (page != old_page):
edit_list.append("1")
elif (page == old_page):
now_answer = answer_dict["MM60341_wang"] + answer_dict["MM60341_ming"] + answer_dict["MM60341_zhang"] + answer_dict["MM60341_li"] + answer_dict["MM60341_hua"]
if (len(old_answer) < len(now_answer)):
edit_list.append("1")
else:
edit_list.append("0")
else:
edit_list.append("0")
else:
edit_list.append("0")
else:
edit_list.append("0")
elif (old_task_name == "生活水平问题"):
if (old_page in [1, 2, 3, 7]):
old_answer = str(list(old_answer_dict.values()))
oldold_answer = str(list(oldold_answer_dict.values()))
if (old_answer.count("-1") == oldold_answer.count("-1")):
edit_list.append("1")
else:
edit_list.append("0")
elif (old_page == 4):
old_answer = old_answer_dict["MM60431"] + old_answer_dict["MM60432"]
oldold_answer = oldold_answer_dict["MM60431"] + oldold_answer_dict["MM60432"]
if (len(old_answer) < len(oldold_answer)):
if (page != old_page):
edit_list.append("1")
elif (page == old_page):
now_answer = answer_dict["MM60431"] + answer_dict["MM60432"]
if (len(old_answer) < len(now_answer)):
edit_list.append("1")
else:
edit_list.append("0")
else:
edit_list.append("0")
else:
edit_list.append("0")
elif (old_page == 5):
old_answer = old_answer_dict["MM60441"] + old_answer_dict["MM60442"]
oldold_answer = oldold_answer_dict["MM60441"] + oldold_answer_dict["MM60442"]
if (len(old_answer) < len(oldold_answer)):
if (page != old_page):
edit_list.append("1")
elif (page == old_page):
now_answer = answer_dict["MM60441"] + answer_dict["MM60442"]
if (len(old_answer) < len(now_answer)):
edit_list.append("1")
else:
edit_list.append("0")
else:
edit_list.append("0")
else:
edit_list.append("0")
elif (old_page == 6):
old_answer = old_answer_dict["MM60451_1"] + old_answer_dict["MM60451_2"]
oldold_answer = oldold_answer_dict["MM60451_1"] + oldold_answer_dict["MM60451_2"]
if (len(old_answer) < len(oldold_answer)):
if (page != old_page):
edit_list.append("1")
elif (page == old_page):
now_answer = answer_dict["MM60451_1"] + answer_dict["MM60451_2"]
if (len(old_answer) < len(now_answer)):
edit_list.append("1")
else:
edit_list.append("0")
else:
edit_list.append("0")
else:
edit_list.append("0")
else:
edit_list.append("0")
else:
edit_list.append("0")
edit = int(edit_list[-1])
# 以下代码写入AR列满足AR列的前提条件是一个edit列
# 满足AR列的条件(edit==1) && (该行的页面序号小于最大已访问页面序号)
if (max_page < old_page):
max_page = old_page
if (edit == 1 and old_page < max_page):
AR_list.append("1")
else:
AR_list.append("0")
# 以下代码:常规动作,更新全局变量
oldold_page = old_page
old_page = page
oldold_time = old_time
old_time = timestamp
else:
# 若该行是无内容null页用于分隔不同的学生作答
# 以下代码:此时可以重新初始化各个每个学生特有的变量
page = 0
before_tag = []
max_page = 0
# 以下代码如果改变了用户id或task_name则当前时间设为starttime即第一道题的开始时间
if ((old_id != id) or (old_task_name != task_name)):
print("###")
starttime = timestamp
# 以下代码:向写入新列中填充合适的空值
page_list.append("")
page_time_list.append("")
edit_list.append("0")
AR_list.append("0")
before_list.append("")
# 以下代码但针对after列切换了用户则必然代表切换到新页因此向after列中填充合适的空值
time_tmp = caltime(oldold_time, old_time)
if (time_tmp == ""):
time_tmp = 0
after_list.append(time_tmp)
# 以下代码:常规动作,更新全局变量
oldold_time = old_time
old_time = timestamp
oldold_page = old_page
old_page = page
# 以下代码:常规动作,更新全局变量
old_id = id
old_task_name = task_name
oldold_answer_dict = old_answer_dict
old_answer_dict = answer_dict
# 以下代码:将四个列表向前循环一位
x = edit_list.pop(0)
edit_list.append(x)
x = AR_list.pop(0)
AR_list.append(x)
x = before_list.pop(0)
before_list.append(x)
x = after_list.pop(0)
after_list.append(x)
# 以下代码将6个新列表写入原始pandas数据
col_name = data.columns.tolist()
col_name.insert(col_name.index('task_answer')+1, 'AR')
col_name.insert(col_name.index('task_answer')+1, 'after')
col_name.insert(col_name.index('task_answer')+1, 'before')
col_name.insert(col_name.index('task_answer')+1, 'edit')
col_name.insert(col_name.index('task_answer')+1, 'pagetime')
col_name.insert(col_name.index('task_answer')+1, 'page')
data = data.reindex(columns=col_name)
data['AR'] = AR_list
data['after'] = after_list
data['before'] = before_list
data['edit'] = edit_list
data['pagetime'] = page_time_list
data['page'] = page_list
data.to_excel('A_demo_out.xlsx') # 数据导出