Data Preprocessing
A tutorial on how to preprocess data with pandas and scikit-learn.
- 1. Import libraries
- 2. Get the data
- 3. Check for missing values
- 4. Convert categorical data into numbers
- 5. Split data into train and test
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the raw car-sales data and take a first look at it.
car_sales = pd.read_csv("car-sales-data.csv")
car_sales.head()

# Count the missing values in every column.
car_sales.isna().sum()

# Rows without a Price are dropped rather than filled.
car_sales.dropna(subset=["Price"], inplace=True)
car_sales.isna().sum()
car_sales
car_sales.isna().sum()
car_sales.info()

# Fill the remaining gaps with pandas.
# The filled column is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on `car_sales[col]`: the in-place call works on
# a temporary column selection, which triggers a chained-assignment warning
# in pandas 2.x and leaves `car_sales` unchanged once Copy-on-Write is enabled.
car_sales["Make"] = car_sales["Make"].fillna("missing")
car_sales["Colour"] = car_sales["Colour"].fillna("missing")
car_sales["Odometer (KM)"] = car_sales["Odometer (KM)"].fillna(
    car_sales["Odometer (KM)"].median()
)
car_sales["Doors"] = car_sales["Doors"].fillna(4)
car_sales
car_sales.isna().sum()

# Reload the untouched CSV into a second frame so the missing values are
# still present for the scikit-learn based approach.
car_sales_missing = pd.read_csv("car-sales-data.csv")
car_sales_missing
car_sales_missing.dropna(subset=["Price"], inplace=True)
car_sales_missing.isna().sum()
car_sales_missing.info()
If you use scikit-learn to fill missing values, you first have to split the data into X and y.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Separate the feature columns (X) from the Price column (y).
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

# Fill categorical values with 'missing', Doors with 4 and numerical values
# with the column median.
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="median")

# Define which columns each imputer is applied to.
cat_features = ["Make", "Colour"]
door_feature = ["Doors"]
num_features = ["Odometer (KM)"]

# Create an imputer (something that fills missing data).
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_feature),
    ("num_imputer", num_imputer, num_features),
])

# NOTE(review): the imputer is fitted on every row before the train/test
# split below, so the median also reflects rows that end up in the test set.
# For an unbiased evaluation, split first, fit on the training rows only and
# call `transform` on the test rows.
filled_X = imputer.fit_transform(X)
filled_X

# The output columns follow the order of the transformers above, so the
# column names are derived from the same lists instead of being retyped.
car_sales_filled = pd.DataFrame(
    filled_X,
    columns=cat_features + door_feature + num_features,
)
car_sales_filled.isna().sum()
car_sales_filled

# One-hot encode the categorical columns (Doors is treated as a category);
# every other column is passed through unchanged.
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    [("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

# Encode the whole filled dataset (train and test are not separated yet).
transformed_X = transformer.fit_transform(car_sales_filled)

# Inspect the encoded features as a dense array.
# `.toarray()` assumes the transformer returned a SciPy sparse matrix.
transformed_X.toarray()

# The global seed is kept for any other NumPy randomness; `random_state`
# pins the split itself so it does not depend on earlier random calls.
np.random.seed(2509)
X_train, X_test, y_train, y_test = train_test_split(
    transformed_X,
    y,
    test_size=0.2,
    random_state=2509,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Now the data is in the right shape to fit a model.
Happy coding and have a great time learning how to make machines smarter.