Preprocessing

create_sequences(input_data, target_column, sequence_length)

Create sequences from the input data.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_data | pd.DataFrame | Pandas dataframe of input data. | required |
| target_column | str | Name of the column to predict. | required |
| sequence_length | int | Length of the sequence. | required |

Returns:

| Type | Description |
| --- | --- |
| List[Tuple[pd.DataFrame, float]] | List of sequences. |

Source code in make_us_rich/pipelines/preprocessing/nodes.py
def create_sequences(
    input_data: pd.DataFrame, 
    target_column: str, 
    sequence_length: int,
    ) -> List[Tuple[pd.DataFrame, float]]:
    """
    Create sequences from the input data.

    Parameters
    ----------
    input_data: pd.DataFrame
        Pandas dataframe of input data.
    target_column: str
        Name of the column to predict.
    sequence_length: int
        Length of the sequence.

    Returns
    -------
    List[Tuple[pd.DataFrame, float]]
        List of sequences.
    """
    sequences = []
    size = len(input_data)
    for i in range(size - sequence_length):
        sequence = input_data[i: i + sequence_length]
        label_position = i + sequence_length
        label = input_data.iloc[label_position][target_column]
        sequences.append((sequence, label))
    return sequences
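
A minimal usage sketch, assuming the node can be imported directly from make_us_rich.pipelines.preprocessing.nodes and applied to a small synthetic dataframe; the single "close" column is illustrative only.

import pandas as pd

from make_us_rich.pipelines.preprocessing.nodes import create_sequences

# Synthetic frame: ten rows of a single illustrative "close" column.
df = pd.DataFrame({"close": [float(i) for i in range(10)]})

# Each sequence is a 3-row window; its label is the "close" value of the
# row that immediately follows the window.
sequences = create_sequences(df, target_column="close", sequence_length=3)

print(len(sequences))        # 7 windows (len(df) - sequence_length)
window, label = sequences[0]
print(label)                 # 3.0, the close value of the 4th row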

extract_features_from_dataset(data)

Extract features from dataset.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | pd.DataFrame | Market chart data. | required |

Returns:

| Type | Description |
| --- | --- |
| pd.DataFrame | Pandas dataframe of features. |

Source code in make_us_rich/pipelines/preprocessing/nodes.py
def extract_features_from_dataset(data: pd.DataFrame) -> pd.DataFrame:
    """
    Extract features from dataset.

    Parameters
    ----------
    data: pd.DataFrame
        Market chart data.

    Returns
    -------
    pd.DataFrame
        Pandas dataframe of features.
    """
    rows = []
    for _, row in data.iterrows():
        row_data = dict(
            day_of_week=row["timestamp"].dayofweek,
            day_of_month=row["timestamp"].day,
            week_of_year=row["timestamp"].week,
            month_of_year=row["timestamp"].month,
            open=row["open"],
            high=row["high"],
            low=row["low"],
            close=row["close"],
            close_change=row["close"] - row["open"],
        )
        rows.append(row_data)
    return pd.DataFrame(rows)
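
A usage sketch, assuming the input frame carries a pandas Timestamp column named "timestamp" plus open/high/low/close prices (the fields the node reads); the values below are illustrative only.

import pandas as pd

from make_us_rich.pipelines.preprocessing.nodes import extract_features_from_dataset

# Illustrative market-chart rows with a Timestamp column and OHLC prices.
raw = pd.DataFrame({
    "timestamp": pd.to_datetime(["2022-01-03", "2022-01-04"]),
    "open": [46200.0, 46450.0],
    "high": [46750.0, 47100.0],
    "low": [45900.0, 46300.0],
    "close": [46450.0, 46900.0],
})

features = extract_features_from_dataset(raw)
print(features.columns.tolist())
# ['day_of_week', 'day_of_month', 'week_of_year', 'month_of_year',
#  'open', 'high', 'low', 'close', 'close_change']
print(features["close_change"].tolist())  # [250.0, 450.0]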

scale_data(train_df, test_df, dir_path)

Scale data to the [-1, 1] range using a min-max scaler fitted on the training data.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| train_df | pd.DataFrame | Training data. | required |
| test_df | pd.DataFrame | Test data. | required |
| dir_path | str | Directory path to save the scaler. | required |

Returns:

| Type | Description |
| --- | --- |
| Tuple[pd.DataFrame, pd.DataFrame] | Scaled training and test data. |

Source code in make_us_rich/pipelines/preprocessing/nodes.py
def scale_data(
    train_df: pd.DataFrame, test_df: pd.DataFrame, dir_path: str,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Scale data to the [-1, 1] range using a min-max scaler fitted on the training data.

    Parameters
    ----------
    train_df: pd.DataFrame
        Training data.
    test_df: pd.DataFrame
        Test data.
    dir_path: str
        Directory path to save the scaler.

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        Scaled training and test data.
    """
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler = scaler.fit(train_df)

    scaled_train_df = pd.DataFrame(
        scaler.transform(train_df),
        index=train_df.index, 
        columns=train_df.columns,
    )
    scaled_test_df = pd.DataFrame(
        scaler.transform(test_df),
        index=test_df.index,
        columns=test_df.columns,
    )
    dump(scaler, open(f"{dir_path}/scaler.pkl", "wb"))
    return scaled_train_df, scaled_test_df
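
A sketch of scaling a train/test split, assuming purely numeric columns (the column names here are illustrative) and a writable directory for the pickled scaler.

import tempfile

import pandas as pd

from make_us_rich.pipelines.preprocessing.nodes import scale_data, split_data

# Illustrative numeric features; the fitted scaler is written to dir_path/scaler.pkl.
data = pd.DataFrame({"close": range(100), "volume": range(100, 200)})
train_df, test_df = split_data(data)

with tempfile.TemporaryDirectory() as tmp_dir:
    scaled_train_df, scaled_test_df = scale_data(train_df, test_df, dir_path=tmp_dir)

# The scaler is fitted on the training data only, so training values span
# [-1, 1] exactly, while test values may fall slightly outside that range.
print(scaled_train_df["close"].min(), scaled_train_df["close"].max())  # -1.0 1.0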

split_data(data)

Split data into training and test sets.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | pd.DataFrame | Market chart data. | required |

Returns:

| Type | Description |
| --- | --- |
| Tuple[pd.DataFrame, pd.DataFrame] | Training and test dataframes. |

Source code in make_us_rich/pipelines/preprocessing/nodes.py
def split_data(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split data into training and test sets.

    Parameters
    ----------
    data: pd.DataFrame
        Market chart data.

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        Training and test dataframes.
    """
    train_size = int(len(data) * 0.9)
    train_df, test_df = data[:train_size], data[train_size + 1:]
    return train_df, test_df
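
A short sketch of the chronological 90/10 split on an illustrative single-column frame; note that, as written, the row at the split index is excluded from both sets because of the + 1 offset.

import pandas as pd

from make_us_rich.pipelines.preprocessing.nodes import split_data

data = pd.DataFrame({"close": range(100)})  # illustrative single-column frame
train_df, test_df = split_data(data)

# 90 training rows and 9 test rows: the row at position 90 is dropped by the offset.
print(len(train_df), len(test_df))  # 90 9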

split_train_and_val_sequences(sequences, val_size)

Split sequences into training and validation sets.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| sequences | List[Tuple[pd.DataFrame, float]] | List of sequences. | required |
| val_size | float | Fraction of the data to use for validation. | required |

Returns:

| Type | Description |
| --- | --- |
| Tuple[List[Tuple[pd.DataFrame, float]], List[Tuple[pd.DataFrame, float]]] | Tuple of training and validation sequences. |

Source code in make_us_rich/pipelines/preprocessing/nodes.py
def split_train_and_val_sequences(
    sequences: List[Tuple[pd.DataFrame, float]],
    val_size: float,
) -> Tuple[List[Tuple[pd.DataFrame, float]], List[Tuple[pd.DataFrame, float]]]:
    """
    Split sequences into training and validation sets.

    Parameters
    ----------
    sequences: List[Tuple[pd.DataFrame, float]]
        List of sequences.
    val_size: float
        Fraction of the data to use for validation.

    Returns
    -------
    Tuple[List[Tuple[pd.DataFrame, float]], List[Tuple[pd.DataFrame, float]]]
        Tuple of training and validation sequences.
    """
    train_sequences, val_sequences = [], []
    for sequence, label in sequences:
        if len(train_sequences) < len(sequences) * (1 - val_size):
            train_sequences.append((sequence, label))
        else:
            val_sequences.append((sequence, label))
    return train_sequences, val_sequences
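
A sketch chaining the output of create_sequences into the split, using an illustrative 20-row frame and a validation fraction of 0.2.

import pandas as pd

from make_us_rich.pipelines.preprocessing.nodes import (
    create_sequences,
    split_train_and_val_sequences,
)

df = pd.DataFrame({"close": [float(i) for i in range(20)]})
sequences = create_sequences(df, target_column="close", sequence_length=5)

# The first 80% of the sequences go to training, the remainder to validation.
train_sequences, val_sequences = split_train_and_val_sequences(sequences, val_size=0.2)
print(len(sequences), len(train_sequences), len(val_sequences))  # 15 12 3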

Last update: 2022-05-04