forked from Open-CT/openbrain
116 lines
4.7 KiB
Python
116 lines
4.7 KiB
Python
# ==========================================================================
|
||
# 处理过程性数据获取学生特征值 - step2
|
||
# 本脚本以完成了预处理的【已排序的】学生作答过程性作为待处理的原始数据文档,对其进行
|
||
# 特征值抽取,获得每个学生对每个作答页面的以下特征值数据:
|
||
# Time:在每个页面上停留的时间(总计,每次大于1秒)
|
||
# Revise:在每个页面上修改的次数(总计)
|
||
# Repeat:停留的次数(总计,大于1秒)
|
||
# Before:进入每个小题作答前时间
|
||
# After:进入每个小题作答后时间
|
||
# AR:每个小题作答完成后再返回被修改次数
|
||
# ==========================================================================
|
||
|
||
import json
|
||
import pandas as pd
|
||
from sqlalchemy import column
|
||
|
||
data = pd.read_excel(r"A_demo_out.xlsx")
|
||
id_dict = {} # 最终全部数据都保存在这一变量中,可以直接转换为pandas类型对象,进而写入excel文件
|
||
for index, row in data.iterrows():
|
||
# 逐行读取
|
||
id = str(row["ticket_id"])
|
||
task_name = str(row["task_name"])
|
||
if (id not in id_dict.keys()):
|
||
id_dict[id] = {}
|
||
id_com = id_dict[id]
|
||
# 以下代码:读取页数信息、读取预处理得到的关键值,若原表格中为空值则此处赋值为-1
|
||
edit = int(row["edit"])
|
||
AR = int(row["AR"])
|
||
answer = json.loads(row['task_answer'])
|
||
frame = answer["frame"]
|
||
if (frame == None):
|
||
page_now = -1
|
||
else:
|
||
page_now = int(frame["data"]["page"])
|
||
|
||
if (pd.isnull(row["page"])):
|
||
page = -1
|
||
else:
|
||
page = int(row["page"])
|
||
|
||
if (pd.isnull(row["pagetime"])):
|
||
pagetime = -1
|
||
else:
|
||
pagetime = float(row["pagetime"])
|
||
|
||
if (pd.isnull(row["before"])):
|
||
before = -1
|
||
else:
|
||
before = float(row["before"])
|
||
|
||
if (pd.isnull(row["after"])):
|
||
after = -1
|
||
else:
|
||
after = float(row["after"])
|
||
|
||
if (task_name == "运动会问题"):
|
||
if (page != -1 and pagetime != -1):
|
||
column_time = "sports_Time_A" + str(page) # 停留总时间
|
||
column_repeat = "sports_Repeat_A" + str(page) # 返回停留的次数
|
||
if (column_time in id_com.keys()):
|
||
id_com[column_time] = id_com[column_time] + pagetime
|
||
id_com[column_repeat] = id_com[column_repeat] + 1
|
||
else:
|
||
id_com[column_time] = pagetime
|
||
id_com[column_repeat] = 0
|
||
if (page_now != -1):
|
||
column_revise = "sports_Revise_A" + str(page_now) # 修改总次数
|
||
if (column_revise in id_com.keys()):
|
||
id_com[column_revise] = id_com[column_revise] + edit
|
||
else:
|
||
id_com[column_revise] = 0
|
||
column_AR = "sports_AR_A" + str(page_now) # 完成后返回修改总次数
|
||
if (column_AR in id_com.keys()):
|
||
id_com[column_AR] = id_com[column_AR] + AR
|
||
else:
|
||
id_com[column_AR] = 0
|
||
if (before != -1):
|
||
column_before = "sports_before_A" + str(page_now)
|
||
id_com[column_before] = before
|
||
if (after != -1):
|
||
column_after = "sports_after_A" + str(page_now-1)
|
||
id_com[column_after] = after
|
||
elif (task_name == "生活水平问题"):
|
||
if (page != -1 and pagetime != -1):
|
||
column_time = "life_Time_A" + str(page) # 停留总时间
|
||
column_repeat = "life_Repeat_A" + str(page) # 返回停留的次数
|
||
if (column_time in id_com.keys()):
|
||
id_com[column_time] = id_com[column_time] + pagetime
|
||
id_com[column_repeat] = id_com[column_repeat] + 1
|
||
else:
|
||
id_com[column_time] = pagetime
|
||
id_com[column_repeat] = 0
|
||
if (page_now != -1):
|
||
column_revise = "life_Revise_A" + str(page_now) # 修改总次数
|
||
if (column_revise in id_com.keys()):
|
||
id_com[column_revise] = id_com[column_revise] + edit
|
||
else:
|
||
id_com[column_revise] = 0
|
||
column_AR = "life_AR_A" + str(page_now) # 完成后返回修改总次数
|
||
if (column_AR in id_com.keys()):
|
||
id_com[column_AR] = id_com[column_AR] + AR
|
||
else:
|
||
id_com[column_AR] = 0
|
||
if (before != -1):
|
||
column_before = "life_before_A" + str(page_now)
|
||
id_com[column_before] = before
|
||
if (after != -1):
|
||
column_after = "life_after_A" + str(page_now-1)
|
||
id_com[column_after] = after
|
||
print(id_com)
|
||
id_dict[id] = id_com
|
||
|
||
|
||
data_df = pd.DataFrame(id_dict).T
|
||
data_df = data_df.fillna(-1) # 空缺位置填充-1
|
||
data_df.to_excel("A_demo_statre.xlsx") |