Skip to content

Instantly share code, notes, and snippets.

@koppd
Last active October 16, 2023 15:55
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Embed
What would you like to do?
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
# Load adult.csv, discard the "education" column and remove every row that
# contains at least one missing value.
dataset = (
    pd.read_csv("../../data/Karan/DataGenerationRaw/adult.csv")
    .drop(columns=["education"])
    .dropna(how="any")
)

# Name of the target column; it is left out of feature scaling.
Y_VAR = "income"
def preprocess_dataset(raw_dataset: pd.DataFrame) -> tuple:
    """Encode the categorical columns of *raw_dataset* and standardise features.

    The input frame is copied first, so ``raw_dataset`` itself is not modified.

    Steps, in order:

    1. ``workclass``, ``relationship``, ``race`` and ``marital-status`` are
       label-encoded and standardised; these versions are only returned, they
       are not written into the frame.
    2. The same four columns are one-hot encoded in the frame; the indicator
       for the first category of each variable and the original column are
       dropped.
    3. ``occupation`` and ``native-country`` are replaced by the relative
       frequency of each value.
    4. ``gender`` (Female/Male) and ``income`` (<=50K/>50K) are mapped to 0/1.
    5. Every column except ``Y_VAR`` -- including the one-hot indicators and
       ``gender`` -- is standardised with ``StandardScaler``.

    Args:
        raw_dataset: Frame containing at least the columns named above. All
            other columns are passed to ``StandardScaler`` unchanged, so they
            are expected to be numeric.

    Returns:
        A ``(dataset, one_hot_mapping, label_encoded_versions)`` tuple where
        ``dataset`` is the transformed frame, ``one_hot_mapping`` maps each
        one-hot variable to the array of indicator column names kept for it,
        and ``label_encoded_versions`` maps each one-hot variable to a 1-D
        array holding its standardised label-encoded values.

    Raises:
        ValueError: from ``astype(int)`` if ``gender`` or ``income`` holds a
            value outside the two expected labels (the map yields NaN).
    """
    dataset = raw_dataset.copy()

    # One-hot encoding: Work class, marital status, relationship and race
    one_hot_variables = ["workclass", "relationship", "race", "marital-status"]
    scaler = StandardScaler()
    label_encoder = LabelEncoder()

    # Standardised label-encoded copies of the one-hot variables. Both
    # estimators are refit on every call, so sharing the instances is safe.
    label_encoded_versions = {}
    for col in one_hot_variables:
        codes = label_encoder.fit_transform(dataset[col])
        # StandardScaler needs a 2-D input; flatten back to 1-D afterwards.
        label_encoded_versions[col] = scaler.fit_transform(
            codes.reshape(-1, 1)).ravel()

    one_hot_mapping = {}
    for col in one_hot_variables:
        one_hot_encoder = OneHotEncoder()
        indicators = one_hot_encoder.fit_transform(
            dataset[col].values.reshape(-1, 1)).toarray().astype(int)
        resulting_columns = one_hot_encoder.get_feature_names_out([col])
        one_hot_mapping[col] = resulting_columns[1:]
        dataset[resulting_columns] = indicators
        # The first indicator is redundant in a one-hot representation and the
        # original categorical column is no longer needed.
        dataset = dataset.drop(columns=[resulting_columns[0], col])

    # Frequency encoding: occupation, native-country
    for col in ["occupation", "native-country"]:
        freq_encoding = dataset.groupby(col).size() / len(dataset)
        # Assign the column directly: ``.loc[:, col] = ...`` writes into the
        # existing object-typed column under pandas >= 2.0 and would leave it
        # object-typed, whereas this replaces it with a float column.
        dataset[col] = dataset[col].map(freq_encoding)

    # Numeric columns (age, hours-per-week, capital-gain, capital-loss,
    # fnlwgt) are not scaled separately; all features are standardised
    # together at the end.

    # Label Encoding: Gender, Income
    dataset["gender"] = dataset["gender"].map({"Female": 0, "Male": 1}).astype(int)
    dataset["income"] = dataset["income"].map({"<=50K": 0, ">50K": 1}).astype(int)

    # Standardise every feature column; the target column is left as 0/1.
    x_vars = dataset.columns[dataset.columns != Y_VAR]
    dataset[x_vars] = scaler.fit_transform(dataset[x_vars])
    return dataset, one_hot_mapping, label_encoded_versions
# Bundle of (preprocessing function, cleaned raw frame, target column name);
# presumably the entry point other modules import -- confirm against callers.
EXPORTED_DATASET = (
    preprocess_dataset,
    dataset,
    Y_VAR,
)
Sign in to join this conversation on GitHub.