a877aed45f
Change-Id: I16cd7730c1e0732253ac52f51010f6b813295aa7
124 lines
5.6 KiB
Python
124 lines
5.6 KiB
Python
"""
Author: Weisen Pan
Date: 2023-10-24
"""
|
|
import pandas as pd
|
|
import numpy as np
|
|
from tqdm import tqdm
|
|
|
|
def find_pos(out_degree_matrix, num_nodes):
    """Compute the depth (longest incoming path length) of every node.

    A non-zero entry ``out_degree_matrix[m, n]`` is treated as a directed
    edge ``m -> n``.  Nodes without incoming edges get depth 0; every other
    node gets ``1 + max(depth of its predecessors)``.

    Args:
        out_degree_matrix: 2-D array indexable as ``[m, n]`` with at least
            ``num_nodes`` rows and columns.
        num_nodes: Number of nodes to consider.

    Returns:
        A float ``np.ndarray`` of shape ``(num_nodes,)`` holding the depths.

    Raises:
        ValueError: If the edges contain a cycle (including a self-loop), in
            which case no finite depth assignment exists.
    """
    stage = np.zeros(num_nodes)

    # Collect the edges once; the matrix does not change between passes.
    edges = [
        (src, dst)
        for src in range(num_nodes)
        for dst in range(num_nodes)
        if out_degree_matrix[src, dst] != 0
    ]

    # The longest path in a DAG has at most ``num_nodes - 1`` edges, so the
    # relaxation is stable after at most ``num_nodes`` passes.  Anything that
    # is still changing after that can only be caused by a cycle.
    for _ in range(num_nodes + 1):
        previous = stage.copy()
        for src, dst in edges:
            stage[dst] = max(stage[dst], stage[src] + 1)
        if (previous == stage).all():
            return stage

    raise ValueError(
        "out_degree_matrix contains a cycle; node depths are undefined"
    )
|
|
|
|
def create_position(pos, num_feat):
    """Build sinusoidal positional encodings from per-node positions.

    For a position ``p`` and feature index ``i`` the raw angle is
    ``p / 10000 ** (2 * (i // 2) / num_feat)``; even feature indices then
    take the sine of that angle and odd indices the cosine.

    Args:
        pos: 2-D array-like of positions, indexed ``[sample, node]``.
        num_feat: Size of the encoding (last axis of the result).

    Returns:
        A float64 ``np.ndarray`` of shape ``pos.shape + (num_feat,)``.  An
        empty batch yields an empty array of the matching shape.
    """
    positions = np.asarray(pos, dtype=np.float64)

    # Feature pairs (2k, 2k + 1) share one divisor: 10000 ** (2k / num_feat).
    exponents = (np.arange(num_feat) // 2) * 2.0 / num_feat
    divisors = np.power(10000.0, exponents)

    # Broadcast (samples, nodes, 1) against (num_feat,).
    pe = positions[:, :, np.newaxis] / divisors
    pe[:, :, 0::2] = np.sin(pe[:, :, 0::2])
    pe[:, :, 1::2] = np.cos(pe[:, :, 1::2])
    return pe
|
|
|
|
def create_attn_mask(tensor, num_heads, num_nodes):
    """Replicate per-sample masks across attention heads.

    Each sample's matrix has the identity added to it (so the diagonal is
    incremented by one) and the result is copied once per head.

    Args:
        tensor: Array of shape ``(batch, num_nodes, num_nodes)``.
        num_heads: Number of attention heads; must be at least 1.
        num_nodes: Side length of each per-sample matrix.

    Returns:
        A float64 ``np.ndarray`` of shape
        ``(batch, num_heads, tensor.shape[1], tensor.shape[2])``.

    Raises:
        ValueError: If ``num_heads`` is smaller than 1.
    """
    if num_heads < 1:
        raise ValueError("num_heads must be a positive integer")

    # Broadcasting adds the identity to every sample in one step.
    per_sample = np.asarray(tensor + np.eye(num_nodes), dtype=np.float64)

    # Insert the head axis and copy each sample's mask ``num_heads`` times.
    return np.repeat(per_sample[:, np.newaxis, :, :], num_heads, axis=1)
|
|
|
|
def prepare_data_exp1_dag(pred_task, pred_tgt, pred_mode):
    """Load the feature CSVs and DAG-info CSVs for one experiment-1 setup.

    Feature files are read from
    ``./datasets_exp1/<pred_task><pred_tgt>/<pred_mode>`` and the DAG-info
    files from ``./datasets_exp1/<pred_task><pred_tgt>/``, both relative to
    the current working directory.

    Returns:
        A 6-tuple of DataFrames:
        ``(train, val, test, dag_train, dag_val, dag_test)``.
    """
    feature_dir = './datasets_exp1/%s%s/%s' % (pred_task, pred_tgt, pred_mode)
    dag_dir = './datasets_exp1/%s%s/' % (pred_task, pred_tgt)

    feature_paths = [
        feature_dir + suffix
        for suffix in ('/train.csv', '/val.csv', '/test.csv')
    ]
    dag_paths = [
        dag_dir + suffix
        for suffix in ('/train_daginfo.csv', '/val_daginfo.csv', '/test_daginfo.csv')
    ]

    # Read in the same order the results are returned.
    return tuple(pd.read_csv(path) for path in feature_paths + dag_paths)
|
|
|
|
def prepare_data_exp1(pred_task, pred_tgt, pred_mode):
    """Load the train/val/test CSVs for one experiment-1 setup.

    Files are read from ``./datasets_exp1/<pred_task><pred_tgt>/<pred_mode>``
    relative to the current working directory.

    Returns:
        A ``(train, val, test)`` tuple of DataFrames.
    """
    base_dir = './datasets_exp1/%s%s/%s' % (pred_task, pred_tgt, pred_mode)
    suffixes = ('/train.csv', '/val.csv', '/test.csv')
    return tuple(pd.read_csv(base_dir + suffix) for suffix in suffixes)
|
|
|
|
def prepare_data_exp23_dag(split):
    """Load the feature CSVs and DAG-info CSVs for an experiment-2/3 split.

    All six files are read from ``./datasets_exp2_3/<split>`` relative to the
    current working directory.

    Returns:
        A 6-tuple of DataFrames:
        ``(train, val, test, dag_train, dag_val, dag_test)``.
    """
    base_dir = './datasets_exp2_3/%s' % split
    suffixes = (
        '/train.csv',
        '/val.csv',
        '/test.csv',
        '/train_daginfo.csv',
        '/val_daginfo.csv',
        '/test_daginfo.csv',
    )
    return tuple(pd.read_csv(base_dir + suffix) for suffix in suffixes)
|
|
|
|
def prepare_data_exp23(split):
    """Load the train/val/test CSVs for an experiment-2/3 split.

    Files are read from ``./datasets_exp2_3/<split>`` relative to the current
    working directory.

    Returns:
        A ``(train, val, test)`` tuple of DataFrames.
    """
    base_dir = './datasets_exp2_3/%s' % split
    suffixes = ('/train.csv', '/val.csv', '/test.csv')
    return tuple(pd.read_csv(base_dir + suffix) for suffix in suffixes)
|
|
|
|
def _dag_position_and_mask(df_dag, num_samples, num_feat, num_task, num_head):
    """Derive positional encodings and attention masks from one DAG-info frame."""
    # Each sample occupies ``num_task`` rows of ``2 * num_task + 1`` columns.
    # The first column is discarded (presumably an identifier -- confirm
    # against the CSV schema).
    dag = df_dag.to_numpy().reshape(-1, num_task, num_task * 2 + 1)[:, :, 1:]
    dag_out = dag[:, :, :num_task]
    dag_in = dag[:, :, num_task:]

    # Node depth within each sample's DAG, computed from the first half.
    depths = np.zeros((num_samples, num_task))
    for idx in range(num_samples):
        depths[idx] = find_pos(dag_out[idx], num_task)

    position = np.array(create_position(depths, num_feat), dtype=np.float32)
    mask = create_attn_mask(dag_in + dag_out, num_heads=num_head, num_nodes=num_task)
    return position, mask


def preprocess_data_exp1_dag(pred_task, pred_tgt, pred_mode, num_feat=34, num_task=7, num_head=8):
    """Load and preprocess an experiment-1 dataset together with its DAG info.

    Steps:
      1. Read the six CSVs via ``prepare_data_exp1_dag``.
      2. Treat every column but the last of each feature frame as
         ``num_task * num_feat`` features and the last column as the label.
      3. Min-max scale each feature using statistics computed over the
         train, val and test rows together.
      4. Build per-node positional encodings and per-head attention masks
         from the DAG-info frames.

    Args:
        pred_task, pred_tgt, pred_mode: Forwarded to ``prepare_data_exp1_dag``
            to select the dataset directory.
        num_feat: Number of features per task/node.
        num_task: Number of tasks/nodes per sample.  The DAG slicing and
            depth computation follow this value instead of a hard-coded 7.
        num_head: Number of attention heads the masks are replicated for.

    Returns:
        ``(train_data, val_data, test_data)``; each is a list with one
        ``(features, label, position, mask)`` tuple per sample, where
        ``features`` and ``position`` are float32 arrays of shape
        ``(num_task, num_feat)`` and ``mask`` is that sample's slice of the
        ``create_attn_mask`` output.
    """
    frames = prepare_data_exp1_dag(pred_task, pred_tgt, pred_mode)
    feature_frames, dag_frames = frames[:3], frames[3:]

    # Features are every column except the trailing label column.
    raw = [np.array(df.iloc[:, :-1]) for df in feature_frames]
    row_counts = [arr.shape[0] * num_task for arr in raw]

    # Flatten to one row per (sample, task) so scaling is per feature.
    stacked = np.vstack([
        arr.reshape(rows, num_feat) for arr, rows in zip(raw, row_counts)
    ])
    low = stacked.min(0)
    # The small epsilon avoids division by zero for constant features.
    stacked = (stacked - low) / (stacked.max(0) - low + 1e-9)

    # Cut the scaled rows back into the three splits.
    train_end = row_counts[0]
    val_end = train_end + row_counts[1]
    scaled = (stacked[:train_end], stacked[train_end:val_end], stacked[val_end:])

    datasets = []
    for df, rows, df_dag in zip(feature_frames, scaled, dag_frames):
        features = np.array(rows.reshape(-1, num_task, num_feat), dtype=np.float32)
        position, mask = _dag_position_and_mask(
            df_dag, df.shape[0], num_feat, num_task, num_head
        )
        datasets.append([
            (features[x], df.iloc[x, -1], position[x], mask[x])
            for x in range(features.shape[0])
        ])

    train_data, val_data, test_data = datasets
    return train_data, val_data, test_data
|