openbrain/pydata-han/Process/getcharacter_processdata_A.py

116 lines
4.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# ==========================================================================
# 处理过程性数据获取学生特征值 - step2
# 本脚本以完成了预处理的【已排序的】学生作答过程性作为待处理的原始数据文档,对其进行
# 特征值抽取,获得每个学生对每个作答页面的以下特征值数据:
# Time:在每个页面上停留的时间总计每次大于1秒
# Revise:在每个页面上修改的次数(总计)
# Repeat:停留的次数总计大于1秒
# Before:进入每个小题作答前时间
# After:进入每个小题作答后时间
# AR:每个小题作答完成后再返回被修改次数
# ==========================================================================
import json
import pandas as pd
from sqlalchemy import column
data = pd.read_excel(r"A_demo_out.xlsx")
id_dict = {} # 最终全部数据都保存在这一变量中可以直接转换为pandas类型对象进而写入excel文件
for index, row in data.iterrows():
# 逐行读取
id = str(row["ticket_id"])
task_name = str(row["task_name"])
if (id not in id_dict.keys()):
id_dict[id] = {}
id_com = id_dict[id]
# 以下代码:读取页数信息、读取预处理得到的关键值,若原表格中为空值则此处赋值为-1
edit = int(row["edit"])
AR = int(row["AR"])
answer = json.loads(row['task_answer'])
frame = answer["frame"]
if (frame == None):
page_now = -1
else:
page_now = int(frame["data"]["page"])
if (pd.isnull(row["page"])):
page = -1
else:
page = int(row["page"])
if (pd.isnull(row["pagetime"])):
pagetime = -1
else:
pagetime = float(row["pagetime"])
if (pd.isnull(row["before"])):
before = -1
else:
before = float(row["before"])
if (pd.isnull(row["after"])):
after = -1
else:
after = float(row["after"])
if (task_name == "运动会问题"):
if (page != -1 and pagetime != -1):
column_time = "sports_Time_A" + str(page) # 停留总时间
column_repeat = "sports_Repeat_A" + str(page) # 返回停留的次数
if (column_time in id_com.keys()):
id_com[column_time] = id_com[column_time] + pagetime
id_com[column_repeat] = id_com[column_repeat] + 1
else:
id_com[column_time] = pagetime
id_com[column_repeat] = 0
if (page_now != -1):
column_revise = "sports_Revise_A" + str(page_now) # 修改总次数
if (column_revise in id_com.keys()):
id_com[column_revise] = id_com[column_revise] + edit
else:
id_com[column_revise] = 0
column_AR = "sports_AR_A" + str(page_now) # 完成后返回修改总次数
if (column_AR in id_com.keys()):
id_com[column_AR] = id_com[column_AR] + AR
else:
id_com[column_AR] = 0
if (before != -1):
column_before = "sports_before_A" + str(page_now)
id_com[column_before] = before
if (after != -1):
column_after = "sports_after_A" + str(page_now-1)
id_com[column_after] = after
elif (task_name == "生活水平问题"):
if (page != -1 and pagetime != -1):
column_time = "life_Time_A" + str(page) # 停留总时间
column_repeat = "life_Repeat_A" + str(page) # 返回停留的次数
if (column_time in id_com.keys()):
id_com[column_time] = id_com[column_time] + pagetime
id_com[column_repeat] = id_com[column_repeat] + 1
else:
id_com[column_time] = pagetime
id_com[column_repeat] = 0
if (page_now != -1):
column_revise = "life_Revise_A" + str(page_now) # 修改总次数
if (column_revise in id_com.keys()):
id_com[column_revise] = id_com[column_revise] + edit
else:
id_com[column_revise] = 0
column_AR = "life_AR_A" + str(page_now) # 完成后返回修改总次数
if (column_AR in id_com.keys()):
id_com[column_AR] = id_com[column_AR] + AR
else:
id_com[column_AR] = 0
if (before != -1):
column_before = "life_before_A" + str(page_now)
id_com[column_before] = before
if (after != -1):
column_after = "life_after_A" + str(page_now-1)
id_com[column_after] = after
print(id_com)
id_dict[id] = id_com
data_df = pd.DataFrame(id_dict).T
data_df = data_df.fillna(-1) # 空缺位置填充-1
data_df.to_excel("A_demo_statre.xlsx")