TimesNet code reading

Main function: ./run.py

args = parser.parse_args()
args.use_gpu = True if torch.cuda.is_available() and args.use_gpu else False

if args.use_gpu and args.use_multi_gpu:
    args.devices = args.devices.replace(' ', '')
    device_ids = args.devices.split(',')
    args.device_ids = [int(id_) for id_ in device_ids]
    args.gpu = args.device_ids[0]

print('Args in experiment:')
print(args)

if args.task_name == 'long_term_forecast':
    Exp = Exp_Long_Term_Forecast
elif args.task_name == 'short_term_forecast':
    Exp = Exp_Short_Term_Forecast
elif args.task_name == 'imputation':
    Exp = Exp_Imputation
elif args.task_name == 'anomaly_detection':
    Exp = Exp_Anomaly_Detection
elif args.task_name == 'classification':
    Exp = Exp_Classification
else:
    Exp = Exp_Long_Term_Forecast

if args.is_training:
    for ii in range(args.itr):
        # setting record of experiments
        setting = '{}_{}_{}_{}_ft{}_sl{}_ll{}_pl{}_dm{}_nh{}_el{}_dl{}_df{}_fc{}_eb{}_dt{}_{}_{}'.format(
            args.task_name,
            args.model_id,
            args.model,
            args.data,
            args.features,
            args.seq_len,
            args.label_len,
            args.pred_len,
            args.d_model,
            args.n_heads,
            args.e_layers,
            args.d_layers,
            args.d_ff,
            args.factor,
            args.embed,
            args.distil,
            args.des, ii)

        exp = Exp(args)  # set experiments
        print('>>>>>>>start training : {}>>>>>>>>>>>>>>>>>>>>>>>>>>'.format(setting))
        exp.train(setting)

        print('>>>>>>>testing : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(setting))
        exp.test(setting)
        torch.cuda.empty_cache()
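For reference, with the Namespace printed later in this post, the setting string resolves to something like (illustrative, derived from those args):

classification_JapaneseVowels_TimesNet_UEA_ftM_sl96_ll48_pl96_dm64_nh8_el2_dl1_df64_fc1_ebtimeF_dtTrue_Exp_0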

First, look at the line exp = Exp(args).

Data reading: ./data_provider/data_loader.py

From ./run.py, the call into the Exp class takes us to ./exp/exp_classification.py. Here, train_data, train_loader = self._get_data(flag='TRAIN') and test_data, test_loader = self._get_data(flag='TEST') read the training and test sets once up front; the purpose is to initialize the network structure. The data is read again later during training:

class Exp_Classification(Exp_Basic):
    def __init__(self, args):
        super(Exp_Classification, self).__init__(args)

    def _build_model(self):
        # model input depends on data
        train_data, train_loader = self._get_data(flag='TRAIN')
        test_data, test_loader = self._get_data(flag='TEST')
        self.args.seq_len = max(train_data.max_seq_len, test_data.max_seq_len)
        self.args.pred_len = 0
        self.args.enc_in = train_data.feature_df.shape[1]
        self.args.num_class = len(train_data.class_names)
        # model init
        model = self.model_dict[self.args.model].Model(self.args).float()
        if self.args.use_multi_gpu and self.args.use_gpu:
            model = nn.DataParallel(model, device_ids=self.args.device_ids)
        return model

    def _get_data(self, flag):
        data_set, data_loader = data_provider(self.args, flag)
        return data_set, data_loader
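For the JapaneseVowels run traced in this post, these data-derived settings work out as follows (values inferred from the shapes printed later, e.g. batch_x of [16, 29, 12]; treat them as illustrative):

# Illustrative values for the JapaneseVowels UEA dataset
# (inferred from this post's printouts, not hard-coded anywhere):
self.args.seq_len = 29    # longest series across train and test
self.args.pred_len = 0    # classification has no forecast horizon
self.args.enc_in = 12     # 12 input channels (features)
self.args.num_class = 9   # 9 speaker classes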

To read the data we first enter ./data_provider/data_factory.py, where we find that UEAloader is called; it is defined in ./data_provider/data_loader.py:

class UEAloader(Dataset):
    """
    Dataset class for datasets included in:
        Time Series Classification Archive (www.timeseriesclassification.com)
    Argument:
        limit_size: float in (0, 1) for debug
    Attributes:
        all_df: (num_samples * seq_len, num_columns) dataframe indexed by integer indices, with multiple rows corresponding to the same index (sample).
            Each row is a time step; Each column contains either metadata (e.g. timestamp) or a feature.
        feature_df: (num_samples * seq_len, feat_dim) dataframe; contains the subset of columns of `all_df` which correspond to selected features
        feature_names: names of columns contained in `feature_df` (same as feature_df.columns)
        all_IDs: (num_samples,) series of IDs contained in `all_df`/`feature_df` (same as all_df.index.unique() )
        labels_df: (num_samples, num_labels) pd.DataFrame of label(s) for each sample
        max_seq_len: maximum sequence (time series) length. If None, script argument `max_seq_len` will be used.
            (Moreover, script argument overrides this attribute)
    """

    def __init__(self, root_path, file_list=None, limit_size=None, flag=None):
        self.root_path = root_path
        self.all_df, self.labels_df = self.load_all(root_path, file_list=file_list, flag=flag)
        self.all_IDs = self.all_df.index.unique()  # all sample IDs (integer indices 0 ... num_samples-1)

        if limit_size is not None:
            if limit_size > 1:
                limit_size = int(limit_size)
            else:  # interpret as proportion if in (0, 1]
                limit_size = int(limit_size * len(self.all_IDs))
            self.all_IDs = self.all_IDs[:limit_size]
            self.all_df = self.all_df.loc[self.all_IDs]

        # use all features
        self.feature_names = self.all_df.columns
        self.feature_df = self.all_df

        # pre_process
        normalizer = Normalizer()
        self.feature_df = normalizer.normalize(self.feature_df)
        # print(len(self.all_IDs))

    def load_all(self, root_path, file_list=None, flag=None):
        """
        Loads datasets from csv files contained in `root_path` into a dataframe, optionally choosing from `pattern`
        Args:
            root_path: directory containing all individual .csv files
            file_list: optionally, provide a list of file paths within `root_path` to consider.
                Otherwise, entire `root_path` contents will be used.
        Returns:
            all_df: a single (possibly concatenated) dataframe with all data corresponding to specified files
            labels_df: dataframe containing label(s) for each sample
        """
        # Select paths for training and evaluation
        if file_list is None:
            data_paths = glob.glob(os.path.join(root_path, '*'))  # list of all paths
        else:
            data_paths = [os.path.join(root_path, p) for p in file_list]
        if len(data_paths) == 0:
            raise Exception('No files found using: {}'.format(os.path.join(root_path, '*')))
        if flag is not None:
            data_paths = list(filter(lambda x: re.search(flag, x), data_paths))
        input_paths = [p for p in data_paths if os.path.isfile(p) and p.endswith('.ts')]
        if len(input_paths) == 0:
            raise Exception("No .ts files found using pattern: '{}'".format(pattern))

        all_df, labels_df = self.load_single(input_paths[0])  # a single file contains dataset

        return all_df, labels_df

    def load_single(self, filepath):
        df, labels = load_data.load_from_tsfile_to_dataframe(filepath, return_separate_X_and_y=True,
                                                             replace_missing_vals_with='NaN')
        labels = pd.Series(labels, dtype="category")
        self.class_names = labels.cat.categories
        labels_df = pd.DataFrame(labels.cat.codes,
                                 dtype=np.int8)  # int8-32 gives an error when using nn.CrossEntropyLoss

        lengths = df.applymap(
            lambda x: len(x)).values  # (num_samples, num_dimensions) array containing the length of each series

        horiz_diffs = np.abs(lengths - np.expand_dims(lengths[:, 0], -1))
        if np.sum(horiz_diffs) > 0:  # if any row (sample) has varying length across dimensions
            df = df.applymap(subsample)

        lengths = df.applymap(lambda x: len(x)).values
        vert_diffs = np.abs(lengths - np.expand_dims(lengths[0, :], 0))
        if np.sum(vert_diffs) > 0:  # if any column (dimension) has varying length across samples
            self.max_seq_len = int(np.max(lengths[:, 0]))
        else:
            self.max_seq_len = lengths[0, 0]


        df = pd.concat((pd.DataFrame({col: df.loc[row, col] for col in df.columns})
                        .reset_index(drop=True)
                        .set_index(pd.Series(lengths[row, 0] * [row]))
                        for row in range(df.shape[0])), axis=0)

        # Replace NaN values
        grp = df.groupby(by=df.index)
        df = grp.transform(interpolate_missing)

        return df, labels_df
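The returned all_df is in long format: the integer index repeats one sample ID per time step. A hypothetical sketch of its layout for this dataset (values made up):

#        dim_0   dim_1  ...  dim_11
# 0      1.86   -0.21   ...   0.31    <- sample 0, time step 0
# 0      1.89   -0.19   ...   0.33    <- sample 0, time step 1
# ...
# 1      0.95    0.40   ...  -0.12    <- sample 1, time step 0
# ...
# so num_samples samples of length up to 29 with 12 dimensions give
# roughly (num_samples * 29, 12) rows x columns.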

Network training and inference: ./exp/exp_classification.py

class Exp_Classification(Exp_Basic):
    def __init__(self, args):
        super(Exp_Classification, self).__init__(args)

    def _build_model(self):
        # model input depends on data
        train_data, train_loader = self._get_data(flag='TRAIN')
        test_data, test_loader = self._get_data(flag='TEST')
        self.args.seq_len = max(train_data.max_seq_len, test_data.max_seq_len)
        self.args.pred_len = 0
        self.args.enc_in = train_data.feature_df.shape[1]
        self.args.num_class = len(train_data.class_names)
        # model init
        model = self.model_dict[self.args.model].Model(self.args).float()
        if self.args.use_multi_gpu and self.args.use_gpu:
            model = nn.DataParallel(model, device_ids=self.args.device_ids)
        return model

    def _get_data(self, flag):
        data_set, data_loader = data_provider(self.args, flag)
        return data_set, data_loader

After reading the data, the network structure parameters are set according to the data, and then the model is initialized with self.model_dict[self.args.model].Model(self.args).float(), where model_dict is defined in ./exp/exp_basic.py:

from models import Autoformer, Transformer, TimesNet, Nonstationary_Transformer, DLinear, FEDformer, \
    Informer, LightTS, Reformer, ETSformer, Pyraformer, PatchTST, MICN, Crossformer
class Exp_Basic(object):
    def __init__(self, args):
        self.args = args
        self.model_dict = {
            'TimesNet': TimesNet,
            'Autoformer': Autoformer,
            'Transformer': Transformer,
            'Nonstationary_Transformer': Nonstationary_Transformer,
            'DLinear': DLinear,
            'FEDformer': FEDformer,
            'Informer': Informer,
            'LightTS': LightTS,
            'Reformer': Reformer,
            'ETSformer': ETSformer,
            'PatchTST': PatchTST,
            'Pyraformer': Pyraformer,
            'MICN': MICN,
            'Crossformer': Crossformer,
        }
        self.device = self._acquire_device()
        self.model = self._build_model().to(self.device)

    def _build_model(self):
        raise NotImplementedError
        return None

So we arrive at ./models/TimesNet.py and its Model class:

class Model(nn.Module):
    """
    Paper link: https://openreview.net/pdf?id=ju_Uqw384Oq
    """

    def __init__(self, configs):
        super(Model, self).__init__()
        self.configs = configs
        self.task_name = configs.task_name
        self.seq_len = configs.seq_len
        self.label_len = configs.label_len
        self.pred_len = configs.pred_len
        self.model = nn.ModuleList([TimesBlock(configs)
                                    for _ in range(configs.e_layers)])
        self.enc_embedding = DataEmbedding(configs.enc_in, configs.d_model, configs.embed, configs.freq,
                                           configs.dropout)
        self.layer = configs.e_layers
        self.layer_norm = nn.LayerNorm(configs.d_model)
        if self.task_name == 'long_term_forecast' or self.task_name == 'short_term_forecast':
            self.predict_linear = nn.Linear(
                self.seq_len, self.pred_len + self.seq_len)
            self.projection = nn.Linear(
                configs.d_model, configs.c_out, bias=True)
        if self.task_name == 'imputation' or self.task_name == 'anomaly_detection':
            self.projection = nn.Linear(
                configs.d_model, configs.c_out, bias=True)
        if self.task_name == 'classification':
            self.act = F.gelu
            self.dropout = nn.Dropout(configs.dropout)
            self.projection = nn.Linear(
                configs.d_model * configs.seq_len, configs.num_class)
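With the classification args derived above, the classification branch would instantiate layers of these sizes (a sketch under those assumed values):

# assuming d_model=64, seq_len=29, num_class=9, dropout=0.1 for this run
self.projection = nn.Linear(64 * 29, 9)   # Linear(1856, 9)
self.dropout = nn.Dropout(0.1)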

Pay attention to this line:

        self.model = nn.ModuleList([TimesBlock(configs)
                                    for _ in range(configs.e_layers)])

It can be found that the network is a stack of e_layers TimesBlock modules:

class TimesBlock(nn.Module):
    def __init__(self, configs):
        super(TimesBlock, self).__init__()
        self.seq_len = configs.seq_len
        self.pred_len = configs.pred_len
        self.k = configs.top_k
        # parameter-efficient design
        self.conv = nn.Sequential(
            Inception_Block_V1(configs.d_model, configs.d_ff,
                               num_kernels=configs.num_kernels),
            nn.GELU(),
            Inception_Block_V1(configs.d_ff, configs.d_model,
                               num_kernels=configs.num_kernels)
        )

    def forward(self, x):
        print(x.shape)
        B, T, N = x.size()
        period_list, period_weight = FFT_for_Period(x, self.k)
        print('period_list',period_list.shape)
        print('period_weight',period_weight.shape)

        res = []
        for i in range(self.k):
            period = period_list[i]
            # padding
            if (self.seq_len + self.pred_len) % period != 0:
                length = (((self.seq_len + self.pred_len) // period) + 1) * period
                padding = torch.zeros([x.shape[0], (length - (self.seq_len + self.pred_len)), x.shape[2]]).to(x.device)
                out = torch.cat([x, padding], dim=1)
            else:
                length = (self.seq_len + self.pred_len)
                out = x
            # reshape
            print('out-reshape-before',out.shape)
            out = out.reshape(B, length // period, period,
                              N).permute(0, 3, 1, 2).contiguous()
            print('out-reshape-after',out.shape)
            # 2D conv: from 1d Variation to 2d Variation
            out = self.conv(out)
            # reshape back
            out = out.permute(0, 2, 3, 1).reshape(B, -1, N)
            print('out',out.shape)
            res.append(out[:, :(self.seq_len + self.pred_len), :])
            print('res len', len(res))
        res = torch.stack(res, dim=-1)
        print(res.shape)
        # adaptive aggregation
        period_weight = F.softmax(period_weight, dim=1)
        period_weight = period_weight.unsqueeze(
            1).unsqueeze(1).repeat(1, T, N, 1)
        res = torch.sum(res * period_weight, -1)
        # residual connection
        res = res + x
        return res

Network Model Structure Design

For one batch, the input x (batch_x) has shape torch.Size([16, 29, 12]);

./exp/exp_classification.py


Entering self.model, which was built by:
self.model = self._build_model().to(self.device)
model = self.model_dict[self.args.model].Model(self.args).float()

./models/TimesNet.py

class Model(nn.Module):
    """
    Paper link: https://openreview.net/pdf?id=ju_Uqw384Oq
    """

    def __init__(self, configs):
        super(Model, self).__init__()
        self.configs = configs
        self.task_name = configs.task_name
        self.seq_len = configs.seq_len
        self.label_len = configs.label_len
        self.pred_len = configs.pred_len
        self.model = nn.ModuleList([TimesBlock(configs)
                                    for _ in range(configs.e_layers)])
        self.enc_embedding = DataEmbedding(configs.enc_in, configs.d_model, configs.embed, configs.freq,
                                           configs.dropout)
        self.layer = configs.e_layers
        self.layer_norm = nn.LayerNorm(configs.d_model)
        if self.task_name == 'long_term_forecast' or self.task_name == 'short_term_forecast':
            self.predict_linear = nn.Linear(
                self.seq_len, self.pred_len + self.seq_len)
            self.projection = nn.Linear(
                configs.d_model, configs.c_out, bias=True)
        if self.task_name == 'imputation' or self.task_name == 'anomaly_detection':
            self.projection = nn.Linear(
                configs.d_model, configs.c_out, bias=True)
        if self.task_name == 'classification':
            self.act = F.gelu
            self.dropout = nn.Dropout(configs.dropout)
            self.projection = nn.Linear(
                configs.d_model * configs.seq_len, configs.num_class)

For outputs = self.model(batch_x, padding_mask, None, None), the forward() function is invoked directly:


    def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None):
        if self.task_name == 'long_term_forecast' or self.task_name == 'short_term_forecast':
            dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
            return dec_out[:, -self.pred_len:, :]  # [B, L, D]
        if self.task_name == 'imputation':
            dec_out = self.imputation(
                x_enc, x_mark_enc, x_dec, x_mark_dec, mask)
            return dec_out  # [B, L, D]
        if self.task_name == 'anomaly_detection':
            dec_out = self.anomaly_detection(x_enc)
            return dec_out  # [B, L, D]
        if self.task_name == 'classification':
            dec_out = self.classification(x_enc, x_mark_enc)
            return dec_out  # [B, N]
        return None

Printing x_enc inside forward() gives torch.Size([16, 29, 12]). The classification branch calls:

    def classification(self, x_enc, x_mark_enc):
        # embedding
        enc_out = self.enc_embedding(x_enc, None)  # [B,T,C]
        # TimesNet
        for i in range(self.layer):
            enc_out = self.layer_norm(self.model[i](enc_out))

        # Output
        # the output transformer encoder/decoder embeddings don't include non-linearity
        output = self.act(enc_out)
        output = self.dropout(output)
        # zero-out padding embeddings
        output = output * x_mark_enc.unsqueeze(-1)
        # (batch_size, seq_length * d_model)
        output = output.reshape(output.shape[0], -1)
        output = self.projection(output)  # (batch_size, num_classes)
        return output
class DataEmbedding(nn.Module):
    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
        super(DataEmbedding, self).__init__()

        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
        self.position_embedding = PositionalEmbedding(d_model=d_model)
        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type,
                                                    freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding(
            d_model=d_model, embed_type=embed_type, freq=freq)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, x_mark):
        if x_mark is None:
            x = self.value_embedding(x) + self.position_embedding(x)
        else:
            x = self.value_embedding(
                x) + self.temporal_embedding(x_mark) + self.position_embedding(x)
        return self.dropout(x)

After embedding, the tensor becomes enc_out_classification torch.Size([16, 29, 64]).
Here 16 is the batch size, 29 is the sequence length, and 12 was the number of channels (dimensions); that is, the embedding maps the 12 input channels to 64 channels.
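A minimal shape check of the embedding under this run's sizes (assuming DataEmbedding and its submodules are importable from layers/Embed.py, as in the TimesNet repo):

import torch
from layers.Embed import DataEmbedding  # repo path; adjust if different

emb = DataEmbedding(c_in=12, d_model=64)
x = torch.randn(16, 29, 12)   # [B, T, C_in], as printed above
print(emb(x, None).shape)     # torch.Size([16, 29, 64]) -> [B, T, d_model]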

FFT, frequency transform:

for i in range(self.layer):
            enc_out = self.layer_norm(self.model[i](enc_out))

This calls self.model[i], i.e. a TimesBlock, whose forward() is:

    def forward(self, x):
        print(x.shape)
        B, T, N = x.size()
        period_list, period_weight = FFT_for_Period(x, self.k)
        print('period_list',period_list.shape)
        print('period_weight',period_weight.shape)

        res = []
        for i in range(self.k):
            period = period_list[i]
            # padding
            if (self.seq_len + self.pred_len) % period != 0:
                length = (((self.seq_len + self.pred_len) // period) + 1) * period
                padding = torch.zeros([x.shape[0], (length - (self.seq_len + self.pred_len)), x.shape[2]]).to(x.device)
                out = torch.cat([x, padding], dim=1)
            else:
                length = (self.seq_len + self.pred_len)
                out = x
            # reshape
            print('out-reshape-before',out.shape)
            out = out.reshape(B, length // period, period,
                              N).permute(0, 3, 1, 2).contiguous()
            print('out-reshape-after',out.shape)
            # 2D conv: from 1d Variation to 2d Variation
            out = self.conv(out)
            # reshape back
            out = out.permute(0, 2, 3, 1).reshape(B, -1, N)
            print('out',out.shape)
            res.append(out[:, :(self.seq_len + self.pred_len), :])
            print('res len', len(res))
        res = torch.stack(res, dim=-1)
        print(res.shape)
        # adaptive aggregation
        period_weight = F.softmax(period_weight, dim=1)
        period_weight = period_weight.unsqueeze(
            1).unsqueeze(1).repeat(1, T, N, 1)
        res = torch.sum(res * period_weight, -1)
        # residual connection
        res = res + x
        return res

Among them, the FFT_for_Period function is:

def FFT_for_Period(x, k=2):
    # [B, T, C]
    xf = torch.fft.rfft(x, dim=1)
    # find period by amplitudes
    frequency_list = abs(xf).mean(0).mean(-1)
    frequency_list[0] = 0
    _, top_list = torch.topk(frequency_list, k)
    top_list = top_list.detach().cpu().numpy()
    period = x.shape[1] // top_list
    return period, abs(xf).mean(-1)[:, top_list]

In this experiment, top_k=3:

Namespace(activation='gelu', anomaly_ratio=0.25, batch_size=16, c_out=7, checkpoints='./checkpoints/', d_ff=64, d_layers=1, d_model=64, data='UEA', data_path='ETTh1.csv', dec_in=7, des='Exp', devices='0,1,2,3', distil=True, dropout=0.1, e_layers=2, embed='timeF', enc_in=7, factor=1, features='M', freq='h', gpu=0, is_training=1, itr=1, label_len=48, learning_rate=0.001, loss='MSE', lradj='type1', mask_rate=0.25, model='TimesNet', model_id='JapaneseVowels', moving_avg=25, n_heads=8, num_kernels=6, num_workers=10, output_attention=False, p_hidden_dims=[128, 128], p_hidden_layers=2, patience=10, pred_len=96, root_path='./dataset/JapaneseVowels/', seasonal_patterns='Monthly', seq_len=96, target='OT', task_name='classification', top_k=3, train_epochs=30, use_amp=False, use_gpu=True, use_multi_gpu=False)

Compute the FFT:

def FFT_for_Period(x, k=2):
    # [B, T, C]
    print('x',x.shape)
    xf = torch.fft.rfft(x, dim=1)
    # find period by amplitudes
    frequency_list = abs(xf).mean(0).mean(-1)
    frequency_list[0] = 0
    _, top_list = torch.topk(frequency_list, k)
    top_list = top_list.detach().cpu().numpy()
    print('x',x.shape)
    period = x.shape[1] // top_list
    print('xshape',x.shape[1])
    print('period',period)
    print('pe',period.shape)
    return period, abs(xf).mean(-1)[:, top_list]


Explanation of the FFT code:

The FFT is used to compute the frequency-domain representation of a given sequence. For an input of length T, torch.fft.rfft returns T//2 + 1 complex values, which carry the amplitude and phase of the different frequency components of the input. The function estimates the importance of each frequency component by its average amplitude and selects the top k components as period estimates.
In this code, the input sequence x has length 29, so the rfft result xf contains 15 complex values (29//2 + 1), corresponding to frequency bins 0 through 14, where bin 0 is the DC component. frequency_list therefore has length 15; the line frequency_list[0] = 0 zeroes out the DC entry so that it can never be selected by topk.
The purpose of abs(xf).mean(0).mean(-1) is to compute the mean magnitude of each frequency component in order to find the most significant frequencies in the sequence. Specifically, abs(xf) takes the magnitude of the complex spectrum, .mean(0) averages over the batch dimension, and .mean(-1) averages over the channel dimension. The result is a tensor of shape [T//2 + 1], where each element is the average magnitude of the corresponding frequency bin.
top_list is an integer array of shape [k] holding the indices of the k largest values in frequency_list; these indices are obtained via torch.topk, and each element is the index of a frequency bin.
period is an integer array of shape [k], where each element is the period corresponding to a bin in top_list, computed as the sequence length integer-divided (//) by the bin index. For example, if top_list[0] is 2, then period[0] = T // 2.

In the Fourier transform, a time-domain signal is represented as a superposition of sine and cosine functions of different frequencies. For a real signal the spectrum is symmetric, so only the first half of the transform result needs to be considered (the T//2 + 1 frequency components returned by the real FFT).
In the expression abs(xf).mean(-1)[:, top_list], the channel-averaged magnitudes of the selected frequency bins are taken for each sample. These bins are the most prominent frequency components of the input and describe its periodicity: if the input contains a periodic pattern of frequency f, the spectrum has a peak at f, and this expression picks out the magnitude of that peak.
Concretely, the real FFT of the input contains T//2 + 1 bins at frequencies 0, 1/T, 2/T, ..., (T//2)/T. The magnitudes of these bins represent the energy of the signal at the corresponding frequencies. To characterize the periodicity of the input, the k most representative bins are selected, and their magnitudes serve as per-sample weights for the corresponding periods.

The output period is found to be (29, 14, 9), i.e. three periods (the entire sequence, half of it, and one third of it); the output abs(xf).mean(-1)[:, top_list] is an array of shape [batch_size, top_k], here [16, 3].


Each element represents the amplitude of a frequency component, so this result gives each sample's amplitude for the signals of period (29, 14, 9), i.e. frequency bins (1, 2, 3).

Time steps

If the frequency content can differ across time steps, why is the output shape only [B, k] rather than [B, 29, k]?
The Fourier magnitudes describe the sequence as a whole, not individual time steps. In abs(xf).mean(-1)[:, top_list], only the magnitudes of the k most representative frequency components are kept per sample; no per-time-step quantity is retained. Keeping one magnitude per step would give a tensor of shape [B, T, k], where T is the input length, which would be much larger and inconvenient for subsequent processing. Here we only care about the global periodicity of the input sequence, not the frequency content at each step, so an output of shape [B, k] suffices.
These frequency components are determined by the spectrum of the entire sequence and are independent of the specific values at individual time steps.

Summary FFT

To sum up, there are two outputs: period_list gives the periods of the dominant frequency components (29, 14, 9), and period_weight gives each sample's amplitude for these three periodic components.
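A quick sanity check of FFT_for_Period on a synthetic signal (a sketch; the function body is copied from above):

import math
import torch

def FFT_for_Period(x, k=2):
    xf = torch.fft.rfft(x, dim=1)
    frequency_list = abs(xf).mean(0).mean(-1)
    frequency_list[0] = 0
    _, top_list = torch.topk(frequency_list, k)
    top_list = top_list.detach().cpu().numpy()
    period = x.shape[1] // top_list
    return period, abs(xf).mean(-1)[:, top_list]

# a pure sine with 4 cycles over T=32 steps concentrates energy in bin 4,
# so the dominant period should be 32 // 4 = 8
t = torch.arange(32).float()
x = torch.sin(2 * math.pi * 4 * t / 32).reshape(1, 32, 1).repeat(16, 1, 2)
period, weight = FFT_for_Period(x, k=2)
print(period)        # first entry is 8
print(weight.shape)  # torch.Size([16, 2])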

TimesBlock:

def forward(self, x):
        print(x.shape)
        B, T, N = x.size()
        period_list, period_weight = FFT_for_Period(x, self.k)
        print('period_list',period_list.shape)
        print('period_weight',period_weight.shape)

        res = []
        for i in range(self.k):
            period = period_list[i]
            # padding
            if (self.seq_len + self.pred_len) % period != 0:
                length = (((self.seq_len + self.pred_len) // period) + 1) * period
                padding = torch.zeros([x.shape[0], (length - (self.seq_len + self.pred_len)), x.shape[2]]).to(x.device)
                out = torch.cat([x, padding], dim=1)
            else:
                length = (self.seq_len + self.pred_len)
                out = x
            # reshape
            print('out-reshape-before',out.shape)
            out = out.reshape(B, length // period, period,
                              N).permute(0, 3, 1, 2).contiguous()
            print('out-reshape-after',out.shape)
            # 2D conv: from 1d Variation to 2d Variation
            out = self.conv(out)
            # reshape back
            out = out.permute(0, 2, 3, 1).reshape(B, -1, N)
            print('out',out.shape)
            res.append(out[:, :(self.seq_len + self.pred_len), :])
            print('res len', len(res))
        res = torch.stack(res, dim=-1)
        print(res.shape)
        # adaptive aggregation
        period_weight = F.softmax(period_weight, dim=1)
        period_weight = period_weight.unsqueeze(
            1).unsqueeze(1).repeat(1, T, N, 1)
        res = torch.sum(res * period_weight, -1)
        # residual connection
        res = res + x
        return res

First, padding is performed on the three periods respectively:

for i in range(self.k):
            period = period_list[i]
            # padding
            if (self.seq_len + self.pred_len) % period != 0:
                length = (((self.seq_len + self.pred_len) // period) + 1) * period
                padding = torch.zeros([x.shape[0], (length - (self.seq_len + self.pred_len)), x.shape[2]]).to(x.device)
                out = torch.cat([x, padding], dim=1)
            else:
                length = (self.seq_len + self.pred_len)
                out = x
            # reshape
            print('out-reshape-before',out.shape)
            out = out.reshape(B, length // period, period,
                              N).permute(0, 3, 1, 2).contiguous()
            print('out-reshape-after',out.shape)
            # 2D conv: from 1d Variation to 2d Variation
            out = self.conv(out)
            # reshape back
            out = out.permute(0, 2, 3, 1).reshape(B, -1, N)
            print('out',out.shape)
            res.append(out[:, :(self.seq_len + self.pred_len), :])

padding followed by reshape

There is not much to say about the padding. You can see that the second and third branches are padded to lengths 42 and 36 respectively, so that the lengths are divisible by the periods 14 and 9 (the first branch stays at 29, which the period 29 already divides).

reshape

The main event is the reshape. After it, the data becomes two-dimensional per channel (the full tensor is four-dimensional, [B, length//period, period, N]). By analogy with images: each 2D slice corresponds to an image, where N is the number of channels, length//period is the image height, and period is the image width.

Specifically, the first step in this line of code is to reshape the out tensor into a shape of [B, length//period, period, N], where B is the batch size of the input sequence and length is the length of the input sequence after padding, period is the period length of the current period feature, and N is the number of channels of the input sequence. This reshape operation divides the input sequence into a series of periodic subsequences, each subsequence contains period time steps of data.
Next, this line of code transforms the dimensions of the tensor through the permute method, transforming it into a shape of [B, N, length//period, period]. The purpose of this transformation operation is to transpose the time and period dimensions of the input sequence and place them on the third and fourth dimensions of the tensor to facilitate the subsequent processing of the convolutional neural network.
Finally, since the permute method may cause the tensor to be stored in a discontinuous manner, this line of code uses the contiguous method to ensure that the tensor is stored in a continuous manner.

At this point, the one-dimensional input of the code is converted into two-dimensional input: [B, N, length//period, period], each two-dimensional tensor corresponds to a two-dimensional image, where N is the number of channels of the image, length//period is the height of the image, and period is the width of the image . In this experiment, the input shape is [16, 64, 4, 9].
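The pad-and-reshape round trip can be checked in isolation with this run's shapes (B=16, T=29, N=64, period=9; a sketch):

import torch

B, T, N, period = 16, 29, 64, 9
x = torch.randn(B, T, N)

# pad 29 -> 36 so the length divides evenly into periods
length = ((T // period) + 1) * period                        # 36
out = torch.cat([x, torch.zeros(B, length - T, N)], dim=1)   # [16, 36, 64]

# fold the 1D series into a 2D map: rows are cycles, columns are steps within a cycle
out = out.reshape(B, length // period, period, N).permute(0, 3, 1, 2).contiguous()
print(out.shape)   # torch.Size([16, 64, 4, 9]) -> [B, N, length//period, period]

# reshape back and drop the padding
out = out.permute(0, 2, 3, 1).reshape(B, -1, N)[:, :T, :]
print(out.shape)   # torch.Size([16, 29, 64])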

out = self.conv(out)
# reshape back
out = out.permute(0, 2, 3, 1).reshape(B, -1, N)
print('out',out.shape)
res.append(out[:, :(self.seq_len + self.pred_len), :])
res = torch.stack(res, dim=-1)
self.conv = nn.Sequential(
    Inception_Block_V1(configs.d_model, configs.d_ff,num_kernels=configs.num_kernels),
    nn.GELU(),
    Inception_Block_V1(configs.d_ff, configs.d_model,num_kernels=configs.num_kernels)
        )
class Inception_Block_V1(nn.Module):
    def __init__(self, in_channels, out_channels, num_kernels=6, init_weight=True):
        super(Inception_Block_V1, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_kernels = num_kernels
        kernels = []
        for i in range(self.num_kernels):
            kernels.append(nn.Conv2d(in_channels, out_channels, kernel_size=2 * i + 1, padding=i))
        self.kernels = nn.ModuleList(kernels)
        if init_weight:
            self._initialize_weights()

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x):
        res_list = []
        for i in range(self.num_kernels):
            res_list.append(self.kernels[i](x))
        res = torch.stack(res_list, dim=-1).mean(-1)
        return res
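Note that the block runs num_kernels parallel Conv2d layers with odd kernel sizes 1, 3, 5, ..., 2*num_kernels-1, each padded so the spatial size is preserved, and then averages their outputs. A quick shape check under this run's sizes (assuming the class above is in scope):

import torch

blk = Inception_Block_V1(in_channels=64, out_channels=64, num_kernels=6)
x = torch.randn(16, 64, 4, 9)   # [B, d_model, length//period, period]
print(blk(x).shape)             # torch.Size([16, 64, 4, 9]) -> spatial size preserved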

After the convolution, the output shape is [B, N, L//p, p], where B is the batch size, N is the number of channels (d_model; self.conv maps d_model to d_ff and back to d_model, and each Inception_Block_V1 averages its num_kernels parallel convolutions, so the channel count is restored), L is the padded length, and p is the period length of the current branch. The odd kernel sizes with matching padding keep the 2D map size unchanged.

Afterwards, permute(0, 2, 3, 1) transposes the time and period dimensions back, rearranging the tensor into shape [B, L//p, p, N]: B is the batch size, L the padded length, p the period length of the current branch, and N the number of channels.

In this shape, the first dimension is the batch, the second is the number of periodic segments (L//p), the third is the time steps within each segment (p), and the fourth is the channels (d_model).

Then reshape collapses the tensor into shape [B, -1, N]; the -1 flattens the segment and within-period dimensions back into a single time axis of length L (padding included).

res.append(out[:, :(self.seq_len + self.pred_len), :]) then removes the padded part, keeping only the input-sequence (and, for forecasting, prediction) portion.

From the printed shapes, each of the three branches is truncated to [16, 29, 64]; after the stack, res becomes [16, 29, 64, 3]. Here 29 is the time series length and 64 is the embedding dimension from DataEmbedding (combining the value_embedding, position_embedding and temporal_embedding layers).

After that,

period_weight = F.softmax(period_weight, dim=1)
        period_weight = period_weight.unsqueeze(
            1).unsqueeze(1).repeat(1, T, N, 1)
        res = torch.sum(res * period_weight, -1)
        # residual connection
        res = res + x

This part of the code first uses the softmax function to normalize the weights of each periodic feature so that their sum is 1. Then, this part of the code expands the weights of the periodic features to the same shape as the input sequence through a series of reshaping and broadcasting operations to facilitate subsequent calculations. Specifically, this part of the code first uses the unsqueeze method to expand the weight of the periodic feature into a shape of [B, 1, 1, k], and then uses the repeat method to copy it into a shape of [B, T, N, k], where T is the length of the input sequence and N is the number of features of the input sequence. The purpose of this expansion operation is to align the weights of periodic features with each time step and feature dimension of the input sequence for subsequent calculations.

Next, the code element-wise multiplies the per-period outputs res with the period weights and sums over the last dimension via torch.sum, obtaining a tensor of shape [B, T, N]. In this step, the amplitude-derived weights form a weighted average over the k per-period outputs, improving the representation power of the result.

Finally, this part of the code performs a residual connection between the weighted average prediction output res and the input sequence x to obtain the final prediction result. The purpose of this residual connection is to preserve the original information in the input sequence and add the predicted output of the periodic features to the original information to get more accurate prediction results.
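The broadcasting in this aggregation step can be traced with this run's shapes (a sketch):

import torch
import torch.nn.functional as F

B, T, N, k = 16, 29, 64, 3
res = torch.randn(B, T, N, k)        # stacked per-period outputs
period_weight = torch.randn(B, k)    # per-sample amplitude of each period

w = F.softmax(period_weight, dim=1)                  # normalize the k weights
w = w.unsqueeze(1).unsqueeze(1).repeat(1, T, N, 1)   # [16, 29, 64, 3]
out = torch.sum(res * w, -1)                         # weighted sum over periods
print(out.shape)                                     # torch.Size([16, 29, 64])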

classification

    def classification(self, x_enc, x_mark_enc):
        # embedding
        enc_out = self.enc_embedding(x_enc, None)  # [B,T,C]
        print('enc_out_classification',enc_out.shape)
        # TimesNet
        for i in range(self.layer):
            enc_out = self.layer_norm(self.model[i](enc_out))
        # Output
        # the output transformer encoder/decoder embeddings don't include non-linearity
        output = self.act(enc_out)
        output = self.dropout(output)
        print('output_classification1',output.shape)
        # zero-out padding embeddings
        output = output * x_mark_enc.unsqueeze(-1)
        print('output_classification2',output.shape)
        # (batch_size, seq_length * d_model)
        output = output.reshape(output.shape[0], -1)
        print('output_classification3',output.shape)
        output = self.projection(output)  # (batch_size, num_classes)
        print('output_classification4',output.shape)
        return output

This code is used to classify the input sequence, which is to map the input sequence to a category label.

Specifically, this part first feeds the input sequence x_enc into the embedding layer enc_embedding (the time-mark argument is None here), obtaining a tensor enc_out of shape [B, T, C], where B is the batch size, T the sequence length, and C the embedding dimension d_model. enc_out is then passed through the stack of TimesBlocks for feature extraction and representation learning: a for loop traverses each block in turn and applies layer normalization to each block's output. This yields a feature representation enc_out that has undergone multiple layers of nonlinear transformation.

Next, the activation function applies a nonlinearity to enc_out to enhance its representation power, a dropout layer regularizes it, and padded time steps are zeroed out by multiplying with x_mark_enc. The result is reshaped into a tensor of shape [B, T * C] and passed through the fully connected layer projection, producing a tensor output of shape [B, num_classes], where num_classes is the number of output categories.

Finally, this part of the code returns the output as the final classification result.
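The shapes through the classification head, under the args derived earlier (a sketch):

import torch
import torch.nn as nn
import torch.nn.functional as F

B, T, d_model, num_class = 16, 29, 64, 9
enc_out = torch.randn(B, T, d_model)
padding_mask = torch.ones(B, T)          # x_mark_enc acts as a padding mask here

projection = nn.Linear(d_model * T, num_class)
out = F.gelu(enc_out)
out = out * padding_mask.unsqueeze(-1)   # zero out padded time steps
out = out.reshape(B, -1)                 # [16, 29 * 64] = [16, 1856]
print(projection(out).shape)             # torch.Size([16, 9])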

Origin: blog.csdn.net/weixin_44907625/article/details/129763571