Main function: ./run.py
# Entry-point logic of run.py: parse the command-line arguments, choose the
# experiment class that matches the requested task, then train and test it
# `args.itr` times.
args = parser.parse_args()
# Use the GPU only when one is available AND the user asked for it.
args.use_gpu = bool(torch.cuda.is_available() and args.use_gpu)

if args.use_gpu and args.use_multi_gpu:
    # Store the space-stripped string back on `args.devices` (the original
    # wrote it to a misspelled `args.dvices`, so the cleanup was never used).
    args.devices = args.devices.replace(' ', '')
    device_ids = args.devices.split(',')
    args.device_ids = [int(id_) for id_ in device_ids]
    args.gpu = args.device_ids[0]

print('Args in experiment:')
print(args)

if args.task_name == 'long_term_forecast':
    Exp = Exp_Long_Term_Forecast
elif args.task_name == 'short_term_forecast':
    Exp = Exp_Short_Term_Forecast
elif args.task_name == 'imputation':
    Exp = Exp_Imputation
elif args.task_name == 'anomaly_detection':
    Exp = Exp_Anomaly_Detection
elif args.task_name == 'classification':
    Exp = Exp_Classification
else:
    # Unknown task names fall back to long-term forecasting.
    Exp = Exp_Long_Term_Forecast

if args.is_training:
    for ii in range(args.itr):
        # setting record of experiments: 18 placeholders, one per argument
        # passed to format() below.
        setting = '{}_{}_{}_{}_ft{}_sl{}_ll{}_pl{}_dm{}_nh{}_el{}_dl{}_df{}_fc{}_eb{}_dt{}_{}_{}'.format(
            args.task_name,
            args.model_id,
            args.model,
            args.data,
            args.features,
            args.seq_len,
            args.label_len,
            args.pred_len,
            args.d_model,
            args.n_heads,
            args.e_layers,
            args.d_layers,
            args.d_ff,
            args.factor,
            args.embed,
            args.distil,
            args.des, ii)

        exp = Exp(args)  # set experiments
        print('>>>>>>>start training : {}>>>>>>>>>>>>>>>>>>>>>>>>>>'.format(setting))
        exp.train(setting)

        print('>>>>>>>testing : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(setting))
        exp.test(setting)
        # Release cached GPU memory before the next repetition.
        torch.cuda.empty_cache()
Look at this statement first: `exp = Exp(args)`.
Data reading: ./data_provider/data_loader.py
In ./run.py, instantiating the Exp class (here `Exp_Classification`) enters ./exp/exp_classification.py. There, the two calls `train_data, train_loader = self._get_data(flag='TRAIN')` and `test_data, test_loader = self._get_data(flag='TEST')`
read the training set and the test set once up front; the purpose is to size and initialize the network structure from the data. Both sets are read again later, during training:
class Exp_Classification(Exp_Basic):
    """Experiment driver for the classification task."""

    def __init__(self, args):
        super().__init__(args)

    def _build_model(self):
        """Derive the data-dependent settings, then construct the model.

        The TRAIN and TEST splits are loaded here only to read their
        sequence length, feature count and class names into ``self.args``.
        """
        # model input depends on data
        train_set, _ = self._get_data(flag='TRAIN')
        test_set, _ = self._get_data(flag='TEST')

        cfg = self.args
        cfg.seq_len = max(train_set.max_seq_len, test_set.max_seq_len)
        cfg.pred_len = 0
        cfg.enc_in = train_set.feature_df.shape[1]
        cfg.num_class = len(train_set.class_names)

        # model init
        net = self.model_dict[cfg.model].Model(cfg).float()
        if cfg.use_multi_gpu and cfg.use_gpu:
            net = nn.DataParallel(net, device_ids=cfg.device_ids)
        return net

    def _get_data(self, flag):
        """Return the ``(dataset, loader)`` pair for the split named ``flag``."""
        dataset, loader = data_provider(self.args, flag)
        return dataset, loader
To read the data, execution first enters ./data_provider/data_factory.py, where it can be seen that `UEAloader` is used; that class is defined in ./data_provider/data_loader.py:
class UEAloader(Dataset):
    """
    Dataset class for datasets included in:
        Time Series Classification Archive (www.timeseriesclassification.com)
    Argument:
        limit_size: float in (0, 1) for debug
    Attributes:
        all_df: (num_samples * seq_len, num_columns) dataframe indexed by integer indices, with multiple rows corresponding to the same index (sample).
            Each row is a time step; Each column contains either metadata (e.g. timestamp) or a feature.
        feature_df: (num_samples * seq_len, feat_dim) dataframe; contains the subset of columns of `all_df` which correspond to selected features
        feature_names: names of columns contained in `feature_df` (same as feature_df.columns)
        all_IDs: (num_samples,) series of IDs contained in `all_df`/`feature_df` (same as all_df.index.unique() )
        labels_df: (num_samples, num_labels) pd.DataFrame of label(s) for each sample
        max_seq_len: maximum sequence (time series) length. If None, script argument `max_seq_len` will be used.
            (Moreover, script argument overrides this attribute)
    """

    def __init__(self, root_path, file_list=None, limit_size=None, flag=None):
        self.root_path = root_path
        self.all_df, self.labels_df = self.load_all(root_path, file_list=file_list, flag=flag)
        self.all_IDs = self.all_df.index.unique()  # all sample IDs (integer indices 0 ... num_samples-1)

        if limit_size is not None:
            if limit_size > 1:
                limit_size = int(limit_size)
            else:  # interpret as proportion if in (0, 1]
                limit_size = int(limit_size * len(self.all_IDs))
            self.all_IDs = self.all_IDs[:limit_size]
            self.all_df = self.all_df.loc[self.all_IDs]

        # use all features
        self.feature_names = self.all_df.columns
        self.feature_df = self.all_df

        # pre_process
        normalizer = Normalizer()
        self.feature_df = normalizer.normalize(self.feature_df)

    def load_all(self, root_path, file_list=None, flag=None):
        """
        Loads the dataset from the first matching `.ts` file found in `root_path`
        Args:
            root_path: directory containing the data files
            file_list: optionally, provide a list of file paths within `root_path` to consider.
                Otherwise, entire `root_path` contents will be used.
            flag: optional regular expression (e.g. 'TRAIN' / 'TEST'); only paths
                matching it via `re.search` are kept.
        Returns:
            all_df: a single (possibly concatenated) dataframe with all data corresponding to specified files
            labels_df: dataframe containing label(s) for each sample
        Raises:
            Exception: if no path is found at all, or if none of the remaining
                paths is an existing `.ts` file.
        """
        # Select paths for training and evaluation
        if file_list is None:
            data_paths = glob.glob(os.path.join(root_path, '*'))  # list of all paths
        else:
            data_paths = [os.path.join(root_path, p) for p in file_list]
        if len(data_paths) == 0:
            raise Exception('No files found using: {}'.format(os.path.join(root_path, '*')))
        if flag is not None:
            data_paths = list(filter(lambda x: re.search(flag, x), data_paths))
        input_paths = [p for p in data_paths if os.path.isfile(p) and p.endswith('.ts')]
        if len(input_paths) == 0:
            # `pattern` was previously undefined here, so this branch raised a
            # NameError instead of the intended, descriptive exception.
            pattern = '*.ts'
            raise Exception("No .ts files found using pattern: '{}'".format(pattern))

        all_df, labels_df = self.load_single(input_paths[0])  # a single file contains dataset

        return all_df, labels_df

    def load_single(self, filepath):
        """
        Loads one `.ts` file and flattens it to one row per time step.
        Side effects: sets `self.class_names` and `self.max_seq_len`.
        Args:
            filepath: path of the `.ts` file to read
        Returns:
            df: dataframe indexed by sample number (one row per time step), after
                `interpolate_missing` has been applied per sample
            labels_df: dataframe with the integer category code of each sample
        """
        df, labels = load_data.load_from_tsfile_to_dataframe(filepath, return_separate_X_and_y=True,
                                                             replace_missing_vals_with='NaN')
        labels = pd.Series(labels, dtype="category")
        self.class_names = labels.cat.categories
        labels_df = pd.DataFrame(labels.cat.codes,
                                 dtype=np.int8)  # int8-32 gives an error when using nn.CrossEntropyLoss

        lengths = df.applymap(
            lambda x: len(x)).values  # (num_samples, num_dimensions) array containing the length of each series

        horiz_diffs = np.abs(lengths - np.expand_dims(lengths[:, 0], -1))

        if np.sum(horiz_diffs) > 0:  # if any row (sample) has varying length across dimensions
            # NOTE(review): `subsample` is defined outside this excerpt; presumably
            # it brings the dimensions of a sample to a common length - confirm.
            df = df.applymap(subsample)

        lengths = df.applymap(lambda x: len(x)).values
        vert_diffs = np.abs(lengths - np.expand_dims(lengths[0, :], 0))
        if np.sum(vert_diffs) > 0:  # if any column (dimension) has varying length across samples
            self.max_seq_len = int(np.max(lengths[:, 0]))
        else:
            # Cast so that both branches store a plain Python int.
            self.max_seq_len = int(lengths[0, 0])

        # Expand every sample (one cell per dimension holding a whole series) into
        # `lengths[row, 0]` rows, all indexed by the sample number `row`.
        df = pd.concat((pd.DataFrame({col: df.loc[row, col] for col in df.columns}).reset_index(drop=True).set_index(
            pd.Series(lengths[row, 0] * [row])) for row in range(df.shape[0])), axis=0)

        # Replace NaN values
        grp = df.groupby(by=df.index)
        df = grp.transform(interpolate_missing)

        return df, labels_df
Network training and inference: ./exp/exp_classification.py
class Exp_Classification(Exp_Basic):
    """Experiment driver for the classification task."""

    def __init__(self, args):
        super().__init__(args)

    def _build_model(self):
        """Derive the data-dependent settings, then construct the model.

        The TRAIN and TEST splits are loaded here only to read their
        sequence length, feature count and class names into ``self.args``.
        """
        # model input depends on data
        train_set, _ = self._get_data(flag='TRAIN')
        test_set, _ = self._get_data(flag='TEST')

        cfg = self.args
        cfg.seq_len = max(train_set.max_seq_len, test_set.max_seq_len)
        cfg.pred_len = 0
        cfg.enc_in = train_set.feature_df.shape[1]
        cfg.num_class = len(train_set.class_names)

        # model init
        net = self.model_dict[cfg.model].Model(cfg).float()
        if cfg.use_multi_gpu and cfg.use_gpu:
            net = nn.DataParallel(net, device_ids=cfg.device_ids)
        return net

    def _get_data(self, flag):
        """Return the ``(dataset, loader)`` pair for the split named ``flag``."""
        dataset, loader = data_provider(self.args, flag)
        return dataset, loader
After reading the data, the network structure parameters are set according to the data, and then the model is initialized with `self.model_dict[self.args.model].Model(self.args).float()`,
where `model_dict` is defined in `Exp_Basic`, the base class of the `Exp_Classification` class in ./exp/exp_classification.py:
from models import Autoformer, Transformer, TimesNet, Nonstationary_Transformer, DLinear, FEDformer, \
Informer, LightTS, Reformer, ETSformer, Pyraformer, PatchTST, MICN, Crossformer
class Exp_Basic(object):
    """Base class of the experiments: registers the models and builds one.

    Subclasses must override ``_build_model``.
    """

    def __init__(self, args):
        self.args = args
        # Maps the model name given in ``args.model`` to the module that
        # provides the corresponding ``Model`` class.
        self.model_dict = {
            'TimesNet': TimesNet,
            'Autoformer': Autoformer,
            'Transformer': Transformer,
            'Nonstationary_Transformer': Nonstationary_Transformer,
            'DLinear': DLinear,
            'FEDformer': FEDformer,
            'Informer': Informer,
            'LightTS': LightTS,
            'Reformer': Reformer,
            'ETSformer': ETSformer,
            'PatchTST': PatchTST,
            'Pyraformer': Pyraformer,
            'MICN': MICN,
            'Crossformer': Crossformer,
        }
        # NOTE(review): `_acquire_device` is not part of this excerpt.
        self.device = self._acquire_device()
        self.model = self._build_model().to(self.device)

    def _build_model(self):
        """Build and return the model; must be implemented by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError
So, with `model='TimesNet'`, this leads to ./models/TimesNet.py and its `Model` class:
class Model(nn.Module):
    """
    Paper link: https://openreview.net/pdf?id=ju_Uqw384Oq
    """

    def __init__(self, configs):
        super().__init__()
        self.configs = configs
        self.task_name = configs.task_name
        self.seq_len = configs.seq_len
        self.label_len = configs.label_len
        self.pred_len = configs.pred_len

        # Backbone: `e_layers` TimesBlocks applied one after another.
        blocks = [TimesBlock(configs) for _ in range(configs.e_layers)]
        self.model = nn.ModuleList(blocks)
        self.enc_embedding = DataEmbedding(
            configs.enc_in, configs.d_model, configs.embed, configs.freq, configs.dropout)
        self.layer = configs.e_layers
        self.layer_norm = nn.LayerNorm(configs.d_model)

        # Task-specific heads.
        task = self.task_name
        if task in ('long_term_forecast', 'short_term_forecast'):
            # Extends the time axis from seq_len to seq_len + pred_len.
            self.predict_linear = nn.Linear(self.seq_len, self.pred_len + self.seq_len)
            self.projection = nn.Linear(configs.d_model, configs.c_out, bias=True)
        if task in ('imputation', 'anomaly_detection'):
            self.projection = nn.Linear(configs.d_model, configs.c_out, bias=True)
        if task == 'classification':
            self.act = F.gelu
            self.dropout = nn.Dropout(configs.dropout)
            # The classifier consumes the flattened (seq_len * d_model) features.
            self.projection = nn.Linear(configs.d_model * configs.seq_len, configs.num_class)
Pay attention to this sentence:
self.model = nn.ModuleList([TimesBlock(configs)
for _ in range(configs.e_layers)])
It can be found that the network is composed of many TimesBlocks:
class TimesBlock(nn.Module):
    """One TimesNet block: period detection, 2D convolution, aggregation."""

    def __init__(self, configs):
        super(TimesBlock, self).__init__()
        self.seq_len = configs.seq_len
        self.pred_len = configs.pred_len
        self.k = configs.top_k
        # parameter-efficient design
        self.conv = nn.Sequential(
            Inception_Block_V1(configs.d_model, configs.d_ff,
                               num_kernels=configs.num_kernels),
            nn.GELU(),
            Inception_Block_V1(configs.d_ff, configs.d_model,
                               num_kernels=configs.num_kernels)
        )

    def forward(self, x):
        """Apply the block to ``x`` and return a tensor of the same shape.

        ``x`` is unpacked as ``B, T, N``. For each of the ``self.k`` periods
        returned by ``FFT_for_Period`` the sequence is zero-padded to a
        multiple of the period, folded into a 2D grid, passed through
        ``self.conv`` and unfolded again. The ``k`` results are combined with
        softmax-normalised amplitude weights and added to the input.

        The ``print`` calls are the shape tracing used by this walkthrough.
        """
        print(x.shape)
        B, T, N = x.size()
        total_len = self.seq_len + self.pred_len
        period_list, period_weight = FFT_for_Period(x, self.k)
        print('period_list', period_list.shape)
        print('period_weight', period_weight.shape)

        res = []
        for i in range(self.k):
            period = period_list[i]
            # padding
            if total_len % period != 0:
                length = ((total_len // period) + 1) * period
                padding = torch.zeros([x.shape[0], (length - total_len), x.shape[2]], device=x.device)
                out = torch.cat([x, padding], dim=1)
            else:
                length = total_len
                out = x
            # reshape
            print('out-reshape-before', out.shape)
            out = out.reshape(B, length // period, period,
                              N).permute(0, 3, 1, 2).contiguous()
            print('out-reshape-after', out.shape)
            # 2D conv: from 1d Variation to 2d Variation
            out = self.conv(out)
            # reshape back
            out = out.permute(0, 2, 3, 1).reshape(B, -1, N)
            print('out', out.shape)
            res.append(out[:, :total_len, :])
            # `res` is still a Python list here and has no `.shape`, so report
            # the shape of the branch that was just appended.
            print('res', res[-1].shape)
        res = torch.stack(res, dim=-1)
        print(res.shape)
        # adaptive aggregation
        period_weight = F.softmax(period_weight, dim=1)
        period_weight = period_weight.unsqueeze(
            1).unsqueeze(1).repeat(1, T, N, 1)
        res = torch.sum(res * period_weight, -1)
        # residual connection
        res = res + x
        return res
Network Model Structure Design
For one input batch, the shape of x is `batch_x: torch.Size([16, 29, 12])`, i.e. [batch_size, sequence length, channels].
In ./exp/exp_classification.py the batch is passed to `self.model`, which was built by:
self.model = self._build_model().to(self.device)
model = self.model_dict[self.args.model].Model(self.args).float()
./models/TimesNet.py
class Model(nn.Module):
    """
    Paper link: https://openreview.net/pdf?id=ju_Uqw384Oq
    """

    def __init__(self, configs):
        super().__init__()
        self.configs = configs
        self.task_name = configs.task_name
        self.seq_len = configs.seq_len
        self.label_len = configs.label_len
        self.pred_len = configs.pred_len

        # Backbone: `e_layers` TimesBlocks applied one after another.
        blocks = [TimesBlock(configs) for _ in range(configs.e_layers)]
        self.model = nn.ModuleList(blocks)
        self.enc_embedding = DataEmbedding(
            configs.enc_in, configs.d_model, configs.embed, configs.freq, configs.dropout)
        self.layer = configs.e_layers
        self.layer_norm = nn.LayerNorm(configs.d_model)

        # Task-specific heads.
        task = self.task_name
        if task in ('long_term_forecast', 'short_term_forecast'):
            # Extends the time axis from seq_len to seq_len + pred_len.
            self.predict_linear = nn.Linear(self.seq_len, self.pred_len + self.seq_len)
            self.projection = nn.Linear(configs.d_model, configs.c_out, bias=True)
        if task in ('imputation', 'anomaly_detection'):
            self.projection = nn.Linear(configs.d_model, configs.c_out, bias=True)
        if task == 'classification':
            self.act = F.gelu
            self.dropout = nn.Dropout(configs.dropout)
            # The classifier consumes the flattened (seq_len * d_model) features.
            self.projection = nn.Linear(configs.d_model * configs.seq_len, configs.num_class)
The call `outputs = self.model(batch_x, padding_mask, None, None)` invokes the model's forward() function:
def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None):
    """Route the inputs to the method matching ``self.task_name``.

    Forecasting tasks return only the last ``self.pred_len`` steps of the
    forecast; the other tasks return the method's output unchanged. An
    unrecognised task name yields ``None``.
    """
    task = self.task_name
    if task in ('long_term_forecast', 'short_term_forecast'):
        forecast = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
        return forecast[:, -self.pred_len:, :]  # [B, L, D]
    elif task == 'imputation':
        return self.imputation(x_enc, x_mark_enc, x_dec, x_mark_dec, mask)  # [B, L, D]
    elif task == 'anomaly_detection':
        return self.anomaly_detection(x_enc)  # [B, L, D]
    elif task == 'classification':
        return self.classification(x_enc, x_mark_enc)  # [B, N]
    return None
Printing `x_enc` inside the forward function still gives `x_enc forward: torch.Size([16, 29, 12])`. For the classification task, forward() dispatches to:
def classification(self, x_enc, x_mark_enc):
    """Compute the class scores for a batch.

    ``x_enc`` is embedded (without time marks), passed through the
    ``self.layer`` TimesBlocks with layer normalisation after each one,
    activated, masked with ``x_mark_enc``, flattened and projected.
    """
    # embedding
    hidden = self.enc_embedding(x_enc, None)  # [B,T,C]
    # TimesNet
    for idx in range(self.layer):
        block = self.model[idx]
        hidden = self.layer_norm(block(hidden))

    # Output
    # the output transformer encoder/decoder embeddings don't include non-linearity
    activated = self.dropout(self.act(hidden))
    # zero-out padding embeddings
    masked = activated * x_mark_enc.unsqueeze(-1)
    # (batch_size, seq_length * d_model)
    flat = masked.reshape(masked.shape[0], -1)
    return self.projection(flat)  # (batch_size, num_classes)
class DataEmbedding(nn.Module):
    """Sum of value, positional and (optional) temporal embeddings, then dropout."""

    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
        super().__init__()
        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
        self.position_embedding = PositionalEmbedding(d_model=d_model)
        # 'timeF' selects the time-feature embedding; anything else the temporal one.
        if embed_type == 'timeF':
            temporal = TimeFeatureEmbedding(d_model=d_model, embed_type=embed_type, freq=freq)
        else:
            temporal = TemporalEmbedding(d_model=d_model, embed_type=embed_type, freq=freq)
        self.temporal_embedding = temporal
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, x_mark):
        """Embed ``x``; the temporal term is skipped when ``x_mark`` is None."""
        embedded = self.value_embedding(x)
        if x_mark is not None:
            embedded = embedded + self.temporal_embedding(x_mark)
        embedded = embedded + self.position_embedding(x)
        return self.dropout(embedded)
After the embedding, the tensor becomes `enc_out_classification: torch.Size([16, 29, 64])`.
Here 16 is the batch_size, 29 is the sequence length, and 12 was the number of input channels (feature dimensions); that is, the embedding maps the 12 input channels to d_model = 64 channels.
FFT, frequency transform:
for i in range(self.layer):
enc_out = self.layer_norm(self.model[i](enc_out))
Each `self.model[i]` is a TimesBlock, so this calls TimesBlock.forward:
def forward(self, x):
    """Apply the block to ``x`` and return a tensor of the same shape.

    ``x`` is unpacked as ``B, T, N``. For each of the ``self.k`` periods
    returned by ``FFT_for_Period`` the sequence is zero-padded to a
    multiple of the period, folded into a 2D grid, passed through
    ``self.conv`` and unfolded again. The ``k`` results are combined with
    softmax-normalised amplitude weights and added to the input.

    The ``print`` calls are the shape tracing used by this walkthrough.
    """
    print(x.shape)
    B, T, N = x.size()
    total_len = self.seq_len + self.pred_len
    period_list, period_weight = FFT_for_Period(x, self.k)
    print('period_list', period_list.shape)
    print('period_weight', period_weight.shape)

    res = []
    for i in range(self.k):
        period = period_list[i]
        # padding
        if total_len % period != 0:
            length = ((total_len // period) + 1) * period
            padding = torch.zeros([x.shape[0], (length - total_len), x.shape[2]], device=x.device)
            out = torch.cat([x, padding], dim=1)
        else:
            length = total_len
            out = x
        # reshape
        print('out-reshape-before', out.shape)
        out = out.reshape(B, length // period, period,
                          N).permute(0, 3, 1, 2).contiguous()
        print('out-reshape-after', out.shape)
        # 2D conv: from 1d Variation to 2d Variation
        out = self.conv(out)
        # reshape back
        out = out.permute(0, 2, 3, 1).reshape(B, -1, N)
        print('out', out.shape)
        res.append(out[:, :total_len, :])
        # `res` is still a Python list here and has no `.shape`, so report
        # the shape of the branch that was just appended.
        print('res', res[-1].shape)
    res = torch.stack(res, dim=-1)
    print(res.shape)
    # adaptive aggregation
    period_weight = F.softmax(period_weight, dim=1)
    period_weight = period_weight.unsqueeze(
        1).unsqueeze(1).repeat(1, T, N, 1)
    res = torch.sum(res * period_weight, -1)
    # residual connection
    res = res + x
    return res
Here, the FFT_for_Period function is:
def FFT_for_Period(x, k=2):
    """Return the ``k`` dominant periods of ``x`` and their amplitudes.

    The real FFT is taken along dim 1. The amplitude spectrum is averaged
    over dim 0 and the last dim, its first (zero-frequency) entry is zeroed,
    and the ``k`` largest entries give the frequency indices.

    Returns:
        period: NumPy array with ``x.shape[1] // index`` for each selected index.
        weights: amplitudes averaged over the last dim, one column per
            selected index.
    """
    # [B, T, C]
    spectrum = torch.fft.rfft(x, dim=1)
    amplitude = abs(spectrum)
    # find period by amplitudes
    mean_amplitude = amplitude.mean(0).mean(-1)
    mean_amplitude[0] = 0
    _, top_idx = torch.topk(mean_amplitude, k)
    top_idx = top_idx.detach().cpu().numpy()
    period = x.shape[1] // top_idx
    return period, amplitude.mean(-1)[:, top_idx]
In this experiment, top_k=3
Namespace(activation='gelu', anomaly_ratio=0.25, batch_size=16, c_out=7, checkpoints='./checkpoints/', d_ff=64, d_layers=1, d_model=64, data='UEA', data_path='ETTh1.csv', dec_in=7, des='Exp', devices='0,1,2,3', distil=True, dropout=0.1, e_layers=2, embed='timeF', enc_in=7, factor=1, features='M', freq='h', gpu=0, is_training=1, itr=1, label_len=48, learning_rate=0.001, loss='MSE', lradj='type1', mask_rate=0.25, model='TimesNet', model_id='JapaneseVowels', moving_avg=25, n_heads=8, num_kernels=6, num_workers=10, output_attention=False, p_hidden_dims=[128, 128], p_hidden_layers=2, patience=10, pred_len=96, root_path='./dataset/JapaneseVowels/', seasonal_patterns='Monthly', seq_len=96, target='OT', task_name='classification', top_k=3, train_epochs=30, use_amp=False, use_gpu=True, use_multi_gpu=False
Compute the FFT:
def FFT_for_Period(x, k=2):
    """Return the ``k`` dominant periods of ``x`` and their amplitudes.

    Same computation as the plain version, with the shape tracing prints
    used by this walkthrough.
    """
    # [B, T, C]
    print('x', x.shape)
    spectrum = torch.fft.rfft(x, dim=1)
    amplitude = abs(spectrum)
    # find period by amplitudes
    mean_amplitude = amplitude.mean(0).mean(-1)
    mean_amplitude[0] = 0
    _, top_idx = torch.topk(mean_amplitude, k)
    top_idx = top_idx.detach().cpu().numpy()
    print('x', x.shape)
    seq_length = x.shape[1]
    period = seq_length // top_idx
    print('xshape', seq_length)
    print('period', period)
    print('pe', period.shape)
    return period, amplitude.mean(-1)[:, top_idx]
Explanation of the FFT code:
The FFT algorithm is used to compute the frequency domain representation of a given sequence. For an input sequence with a length of N, the FFT transformed result contains N/2+1 complex values, which represent the amplitude and phase information of different frequency components in the input sequence. In this function, the importance of each frequency component is estimated by calculating its average amplitude, and the highest k frequency components are selected as the estimate of the period.
In the code, the length of the input sequence x is 29. The function calculates the FFT transformation result xf of x, and then calculates the average magnitude of each frequency component. Since the length of the input sequence is 29, there are 15 complex values (that is, N//2+1) in the FFT transformation result, corresponding to the frequency components from 0 to 14, where 0 represents the DC component. The function suppresses the DC component by setting frequency_list[0] = 0, so frequency_list keeps its length of 15, but its first entry is zeroed so that the DC term does not dominate the top-k selection.
The purpose of abs(xf).mean(0).mean(-1) is to calculate the mean magnitude of the frequency components in order to find the most significant frequencies in the sequence. Specifically, abs(xf) computes the magnitude of the complex Fourier transform, with shape [B, T // 2 + 1, C]; averaging over dim=0 takes the mean over the batch, and averaging over dim=-1 then takes the mean over the channels. The end result is a tensor of shape [T // 2 + 1], where each element represents the average magnitude of the corresponding frequency component.
top_list is an array of integers with shape [k] representing the indices of the k frequency components with the highest values in frequency_list. These indexes can be obtained through the torch.topk function. Each element in top_list is an integer representing the index of the corresponding frequency component in frequency_list.
period is an integer NumPy array with shape [k] (top_list has already been converted with .numpy()), where each element is the period corresponding to a frequency component in top_list. This period is calculated as the input sequence length divided by the corresponding frequency component index value. For example, if top_list[0] has a value of 2, then period[0] will be T // 2. Note that the integer division here uses the // operator.
In the Fourier transform, a time-domain signal can be represented as a superposition of sine and cosine functions of different frequencies. In the real fast Fourier transform, the Fourier transform result of a time-domain signal contains the corresponding frequency components and the corresponding amplitude of each frequency component. For real signals, the Fourier transform is conjugate-symmetric, so only the first half of the transform result needs to be considered (that is, $T//2+1$ frequency components).
In the abs(xf).mean(-1)[:, top_list] expression, the amplitude spectrum is first averaged over the channel dimension, and then, for every sample in the batch, the magnitudes at the selected top-k frequency indices are picked out. These are the components with the largest average amplitude in the input signal, and they can be used to describe the periodicity in the input signal. For example, if the input signal has a periodic pattern with frequency $f$, then the Fourier transform result has a peak at frequency $f$, and the amplitude at that peak is one of the values selected by abs(xf).mean(-1)[:, top_list]. The selected frequency indices are shared by the whole batch (they come from the batch-averaged spectrum), while the amplitudes differ from sample to sample, depending on the characteristics of each input signal.
Specifically, in the real fast Fourier transform, the Fourier transform result of the input signal contains $T//2+1$ frequency components, corresponding to the frequencies $0$, $1/T$, $2/T$, $\dots$, $(T//2)/T$ (in cycles per time step). The magnitude of these frequency components represents the energy or weight of the input signal at the corresponding frequency. In the abs(xf).mean(-1)[:, top_list] expression, in order to find the periodicity of the input signal, we choose the $k$ most representative frequency components, and the amplitudes of these frequency components can be used to represent the periodic characteristics of the input signal. Therefore, for each sample, we can analyze its periodicity in terms of the magnitudes of these frequency components.
The output `period` is found to be (29, 14, 9), i.e. three period lengths (corresponding to the whole sequence, half of the sequence and one third of the sequence: 29 // 1, 29 // 2 and 29 // 3); the other output, abs(xf).mean(-1)[:, top_list], is found to be a tensor of shape [batch_size, k], which is [16, 3].
Each element represents the amplitude of a frequency component, so this result holds the amplitudes of the components whose periods are (29, 14, 9), that is, whose frequency indices are (1, 2, 3) cycles per sequence.
time step
If the time steps are different, why the shape of his output is only [B,k], but does not reflect the different time steps? Not [B,29,k]?
The Fourier transform is taken along the time dimension, so the time axis is replaced by a frequency axis: each amplitude describes one frequency component of the whole sequence, not a single time step. In the abs(xf).mean(-1)[:, top_list] expression, only the magnitudes of the $k$ most representative frequency components are kept as output for each sample. Therefore, the shape of the output tensor only reflects the amplitudes of these $k$ frequency components, i.e. [B, k]. A tensor of shape [B, T, k], where T is the length of the input sequence, would be needed only if the magnitudes were to be kept per time step, but this would make the output tensor much larger, which is not convenient for subsequent processing and analysis. Here we are only interested in the global periodicity of the input sequence, rather than the specific frequency content of each time step, so an output tensor of shape [B, k] is sufficient.
These frequency components are determined by the frequency component distribution of the entire sequence and have nothing to do with the specific values between time steps.
Summary FFT
To sum up, there are two outputs: period_list holds the different period lengths (29, 14, 9), and period_weight holds the amplitudes of the three corresponding frequency components.
TimesBlock:
def forward(self, x):
    """Apply the block to ``x`` and return a tensor of the same shape.

    ``x`` is unpacked as ``B, T, N``. For each of the ``self.k`` periods
    returned by ``FFT_for_Period`` the sequence is zero-padded to a
    multiple of the period, folded into a 2D grid, passed through
    ``self.conv`` and unfolded again. The ``k`` results are combined with
    softmax-normalised amplitude weights and added to the input.

    The ``print`` calls are the shape tracing used by this walkthrough.
    """
    print(x.shape)
    B, T, N = x.size()
    total_len = self.seq_len + self.pred_len
    period_list, period_weight = FFT_for_Period(x, self.k)
    print('period_list', period_list.shape)
    print('period_weight', period_weight.shape)

    res = []
    for i in range(self.k):
        period = period_list[i]
        # padding
        if total_len % period != 0:
            length = ((total_len // period) + 1) * period
            padding = torch.zeros([x.shape[0], (length - total_len), x.shape[2]], device=x.device)
            out = torch.cat([x, padding], dim=1)
        else:
            length = total_len
            out = x
        # reshape
        print('out-reshape-before', out.shape)
        out = out.reshape(B, length // period, period,
                          N).permute(0, 3, 1, 2).contiguous()
        print('out-reshape-after', out.shape)
        # 2D conv: from 1d Variation to 2d Variation
        out = self.conv(out)
        # reshape back
        out = out.permute(0, 2, 3, 1).reshape(B, -1, N)
        print('out', out.shape)
        res.append(out[:, :total_len, :])
        # `res` is still a Python list here and has no `.shape`, so report
        # the shape of the branch that was just appended.
        print('res', res[-1].shape)
    res = torch.stack(res, dim=-1)
    print(res.shape)
    # adaptive aggregation
    period_weight = F.softmax(period_weight, dim=1)
    period_weight = period_weight.unsqueeze(
        1).unsqueeze(1).repeat(1, T, N, 1)
    res = torch.sum(res * period_weight, -1)
    # residual connection
    res = res + x
    return res
First, padding is performed on the three periods respectively:
for i in range(self.k):
period = period_list[i]
# padding
if (self.seq_len + self.pred_len) % period != 0:
length = (((self.seq_len + self.pred_len) // period) + 1) * period
padding = torch.zeros([x.shape[0], (length - (self.seq_len + self.pred_len)), x.shape[2]]).to(x.device)
out = torch.cat([x, padding], dim=1)
else:
length = (self.seq_len + self.pred_len)
out = x
# reshape
print('out-reshape-before',out.shape)
out = out.reshape(B, length // period, period,
N).permute(0, 3, 1, 2).contiguous()
print('out-reshape-after',out.shape)
# 2D conv: from 1d Variation to 2d Variation
out = self.conv(out)
# reshape back
out = out.permute(0, 2, 3, 1).reshape(B, -1, N)
print('out',out.shape)
res.append(out[:, :(self.seq_len + self.pred_len), :])
padding followed by reshape
There is not much to say about padding. You can see that for the second and third periods the sequence is padded from 29 to lengths 42 and 36 respectively, so that the padded lengths (29, 42, 36) are divisible by the periods (29, 14, 9) respectively.
reshape
The main event is the reshape. You can see that after the reshape each sample becomes two-dimensional (the whole tensor is four-dimensional, [B, length//period, period, N]). The analogy is an image: each two-dimensional tensor corresponds to a two-dimensional image, where N is the number of channels of the image, length//period is the height of the image, and period is the width of the image.
Specifically, the first step in this line of code is to reshape the out tensor into a shape of [B, length//period, period, N], where B is the batch size of the input sequence and length is the length of the input sequence after padding, period is the period length of the current period feature, and N is the number of channels of the input sequence. This reshape operation divides the input sequence into a series of periodic subsequences, each subsequence contains period time steps of data.
Next, this line of code rearranges the dimensions of the tensor through the permute method, transforming it into a shape of [B, N, length//period, period]. The purpose of this permutation is to move the channel dimension N to the second position, leaving the (length//period, period) dimensions as the third and fourth dimensions of the tensor, which is the channels-first layout expected by the subsequent 2D convolution.
Finally, since the permute method may cause the tensor to be stored in a discontinuous manner, this line of code uses the contiguous method to ensure that the tensor is stored in a continuous manner.
At this point, the one-dimensional input of the code is converted into two-dimensional input: [B, N, length//period, period]
, each two-dimensional tensor corresponds to a two-dimensional image, where N is the number of channels of the image, length//period is the height of the image, and period is the width of the image . In this experiment, the input shape is [16, 64, 4, 9].
out = self.conv(out)
# reshape back
out = out.permute(0, 2, 3, 1).reshape(B, -1, N)
print('out',out.shape)
res.append(out[:, :(self.seq_len + self.pred_len), :])
res = torch.stack(res, dim=-1)
self.conv = nn.Sequential(
Inception_Block_V1(configs.d_model, configs.d_ff,num_kernels=configs.num_kernels),
nn.GELU(),
Inception_Block_V1(configs.d_ff, configs.d_model,num_kernels=configs.num_kernels)
)
class Inception_Block_V1(nn.Module):
    """Average of ``num_kernels`` parallel 2D convolutions.

    Branch ``i`` uses kernel size ``2 * i + 1`` with padding ``i``.
    """

    def __init__(self, in_channels, out_channels, num_kernels=6, init_weight=True):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_kernels = num_kernels
        self.kernels = nn.ModuleList([
            nn.Conv2d(in_channels, out_channels, kernel_size=2 * idx + 1, padding=idx)
            for idx in range(self.num_kernels)
        ])
        if init_weight:
            self._initialize_weights()

    def _initialize_weights(self):
        """Kaiming-normal weights and zero biases for every Conv2d submodule."""
        for module in self.modules():
            if not isinstance(module, nn.Conv2d):
                continue
            nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Run every branch on ``x`` and return the mean of their outputs."""
        branch_outputs = [self.kernels[idx](x) for idx in range(self.num_kernels)]
        return torch.stack(branch_outputs, dim=-1).mean(-1)
After being processed by the convolutional neural network, the shape of the output data is [B, N, L//p, p], where B is the batch size of the input sequence, N is the feature number of the input sequence, L is the length of the sequence (including the filled part), and p is the period length of the current period feature. The height and width are unchanged because every convolution in the Inception block uses kernel_size = 2i + 1 together with padding = i, and the outputs of the num_kernels branches are averaged rather than concatenated.
Afterwards, the dimensions of the tensor are rearranged by the permute method, so that the channel dimension moves back to the last position and the time-step and period dimensions return to their original order.
After permute(0, 2, 3, 1), the tensor has a shape of [B, L//p, p, N], where B is the batch size of the input sequence, L is the length of the input sequence (including the padded part), p is the period length of the current period feature, and N is the number of features of the input sequence.
In this shape, the first dimension represents the batch size of the input sequence, the second dimension represents the number of subsequences obtained after dividing the sequence by period, the third dimension represents the position within one period, and the fourth dimension represents the number of channels of the feature map (i.e. the number of features of the input sequence).
Then, use the reshape method to reshape the tensor into a shape of [B, -1, N], where -1 means to compress the remaining dimensions into one dimension for subsequent processing.
res.append(out[:, :(self.seq_len + self.pred_len), :])
The purpose of this selection operation is to remove the filled part and only keep the part of the input sequence and the predicted output.
From the printed shapes, it can be seen that each of the three period branches is truncated to [16, 29, 64]; after the stack, the result becomes [16, 29, 64, 3]. Here, 29 is the length of the time series data, and 64 is the embedding dimension after DataEmbedding (which includes the embedding layers value_embedding, position_embedding, and temporal_embedding).
After that:
# normalize the period weights with softmax over dim 1
period_weight = F.softmax(period_weight, dim=1)
# insert two singleton axes at position 1, then tile them T and N times
period_weight = period_weight.unsqueeze(
1).unsqueeze(1).repeat(1, T, N, 1)
# weight res element-wise and sum over the last axis
res = torch.sum(res * period_weight, -1)
# residual connection
res = res + x
This part of the code first uses the softmax function to normalize the weights of each periodic feature so that their sum is 1. Then, this part of the code expands the weights of the periodic features to the same shape as the input sequence through a series of reshaping and broadcasting operations to facilitate subsequent calculations. Specifically, this part of the code first uses the unsqueeze method to expand the weight of the periodic feature into a shape of [B, 1, 1, k], and then uses the repeat method to copy it into a shape of [B, T, N, k], where T is the length of the input sequence and N is the number of features of the input sequence. The purpose of this expansion operation is to align the weights of periodic features with each time step and feature dimension of the input sequence for subsequent calculations.
Next, this part of the code uses element-wise multiplication to weight the per-period outputs res by the period weights, and then sums over the periods to obtain a weighted average output. Specifically, res and period_weight are multiplied element by element and torch.sum reduces the last dimension, giving a tensor with a shape of [B, T, N]. Because the softmax weights sum to 1, this is a weighted average of the outputs of the individual periods, which improves the representation ability of the result.
Finally, this part of the code performs a residual connection between the weighted average prediction output res and the input sequence x to obtain the final prediction result. The purpose of this residual connection is to preserve the original information in the input sequence and add the predicted output of the periodic features to the original information to get more accurate prediction results.
classification
def classification(self, x_enc, x_mark_enc):
    """Compute class scores for a batch of input sequences.

    The input is embedded, passed through ``self.layer`` TimesNet layers
    (each followed by ``self.layer_norm``), activated, regularised with
    dropout, masked, flattened per sample and projected.

    Args:
        x_enc: Input batch handed to ``self.enc_embedding``.
        x_mark_enc: Multiplied into the features after ``unsqueeze(-1)`` to
            zero out padded positions (presumably a [B, T] padding mask;
            confirm against the data loader).

    Returns:
        The output of ``self.projection`` applied to the flattened features.
    """
    # embedding (no time-mark features are passed to the embedding here)
    hidden = self.enc_embedding(x_enc, None)  # [B,T,C]
    print('enc_out_classification', hidden.shape)

    # TimesNet layers, each followed by layer normalisation
    for layer_idx in range(self.layer):
        block_out = self.model[layer_idx](hidden)
        hidden = self.layer_norm(block_out)

    # Output head: the encoder output carries no non-linearity of its own,
    # so apply the activation first, then dropout
    activated = self.dropout(self.act(hidden))
    print('output_classification1', activated.shape)

    # zero-out padding embeddings
    mask = x_mark_enc.unsqueeze(-1)
    masked = activated * mask
    print('output_classification2', masked.shape)

    # (batch_size, seq_length * d_model)
    batch_size = masked.shape[0]
    flat = masked.reshape(batch_size, -1)
    print('output_classification3', flat.shape)

    # (batch_size, num_classes)
    scores = self.projection(flat)
    print('output_classification4', scores.shape)
    return scores
This code is used to classify the input sequence, which is to map the input sequence to a category label.
Specifically, this part of the code first passes the input sequence x_enc through the data embedding layer enc_embedding (None is passed in place of the time-mark argument, so x_mark_enc is not used by the embedding here), and obtains a tensor enc_out with a shape of [B, T, C], where B is the batch size of the input sequence, T is the length of the input sequence, and C is the embedding dimension. This part of the code then feeds enc_out through the stacked TimesNet layers for feature extraction and representation learning. A for loop traverses each sub-module self.model[i] in turn, and the normalization layer layer_norm normalizes the output of each sub-module. Through these steps, this part of the code obtains a feature representation enc_out that has undergone multi-layer nonlinear transformation.
Next, this part of the code uses a fully connected layer projection to map enc_out to output class labels. Specifically, this part of the code first uses the activation function to perform a nonlinear transformation on enc_out to enhance its representation ability, and then regularizes the transformed features using a dropout layer. The result is multiplied by x_mark_enc.unsqueeze(-1), which zeroes out the embeddings at padded time steps, and is then transformed into a tensor of shape [B, T * C] using the reshape method. Finally, the fully connected layer projection maps the flattened features to category scores, giving a tensor output with a shape of [B, num_classes], where num_classes is the number of output categories.
Finally, this part of the code returns the output as the final classification result.