Back to Article
Customer Segmentation Notebook
Download Notebook

Open In Colab

Customer Segmentation Modelling

In [3]:
import pandas as pd

df=pd.read_csv('/content/blinkit_orders.csv')
In [8]:
head_row=df.head(10)
display(head_row)
order_id customer_id order_date promised_delivery_time actual_delivery_time delivery_status order_total payment_method delivery_partner_id store_id
0 1961864118 30065862 2024-07-17 08:34:01 2024-07-17 08:52:01 2024-07-17 08:47:01 On Time 3197.07 Cash 63230 4771
1 1549769649 9573071 2024-05-28 13:14:29 2024-05-28 13:25:29 2024-05-28 13:27:29 On Time 976.55 Cash 14983 7534
2 9185164487 45477575 2024-09-23 13:07:12 2024-09-23 13:25:12 2024-09-23 13:29:12 On Time 839.05 UPI 39859 9886
3 9644738826 88067569 2023-11-24 16:16:56 2023-11-24 16:34:56 2023-11-24 16:33:56 On Time 440.23 Card 61497 7917
4 5427684290 83298567 2023-11-20 05:00:39 2023-11-20 05:17:39 2023-11-20 05:18:39 On Time 2526.68 Cash 84315 2741
5 3265154092 43367112 2023-03-18 16:29:51 2023-03-18 16:49:51 2023-03-18 16:48:51 On Time 3161.43 UPI 554 3442
6 4898355547 13284996 2023-04-16 18:50:37 2023-04-16 19:01:37 2023-04-16 19:02:37 On Time 956.40 Card 14630 1318
7 6568151549 88866835 2024-03-31 06:26:48 2024-03-31 06:37:48 2024-03-31 06:39:48 On Time 905.47 Cash 67714 115
8 6006693867 24496983 2023-07-13 23:49:36 2023-07-14 00:02:36 2023-07-14 00:05:36 On Time 1371.17 Card 91362 9021
9 374186990 52215833 2023-08-09 01:17:30 2023-08-09 01:37:30 2023-08-09 01:44:30 Slightly Delayed 1601.19 Wallet 77203 7955
In [24]:
df_info=df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   order_id                5000 non-null   int64  
 1   customer_id             5000 non-null   int64  
 2   order_date              5000 non-null   object 
 3   promised_delivery_time  5000 non-null   object 
 4   actual_delivery_time    5000 non-null   object 
 5   delivery_status         5000 non-null   object 
 6   order_total             5000 non-null   float64
 7   payment_method          5000 non-null   object 
 8   delivery_partner_id     5000 non-null   int64  
 9   store_id                5000 non-null   int64  
dtypes: float64(1), int64(4), object(5)
memory usage: 390.8+ KB
In [27]:
display(df.isnull().sum())
order_id                  0
customer_id               0
order_date                0
promised_delivery_time    0
actual_delivery_time      0
delivery_status           0
order_total               0
payment_method            0
delivery_partner_id       0
store_id                  0
dtype: int64
In [26]:
#descriptive stats of the dataset
descriptive_stats=df.describe()
display(descriptive_stats)
order_id customer_id order_total delivery_partner_id store_id
count 5.000000e+03 5.000000e+03 5000.00000 5000.000000 5000.000000
mean 5.029129e+09 5.009685e+07 2201.86170 50050.318200 4999.689000
std 2.863533e+09 2.919082e+07 1303.02438 28802.276922 2886.089242
min 6.046500e+04 3.181300e+04 13.25000 43.000000 1.000000
25% 2.531421e+09 2.404314e+07 1086.21500 24928.500000 2509.250000
50% 5.074378e+09 4.997808e+07 2100.69000 50262.500000 4987.000000
75% 7.488579e+09 7.621215e+07 3156.88250 74478.250000 7500.750000
max 9.998298e+09 9.989390e+07 6721.46000 99968.000000 9995.000000
In [4]:
df.dtypes
0
order_id int64
customer_id int64
order_date object
promised_delivery_time object
actual_delivery_time object
delivery_status object
order_total float64
payment_method object
delivery_partner_id int64
store_id int64

In [31]:
#shape
display("Shape of DataFrame:", df.shape)
display("Data Types:", df.dtypes)
display("First 5 Rows:", df.head())
'Shape of DataFrame:'
(5000, 10)
'Data Types:'
order_id                    int64
customer_id                 int64
order_date                 object
promised_delivery_time     object
actual_delivery_time       object
delivery_status            object
order_total               float64
payment_method             object
delivery_partner_id         int64
store_id                    int64
dtype: object
'First 5 Rows:'
order_id customer_id order_date promised_delivery_time actual_delivery_time delivery_status order_total payment_method delivery_partner_id store_id
0 1961864118 30065862 2024-07-17 08:34:01 2024-07-17 08:52:01 2024-07-17 08:47:01 On Time 3197.07 Cash 63230 4771
1 1549769649 9573071 2024-05-28 13:14:29 2024-05-28 13:25:29 2024-05-28 13:27:29 On Time 976.55 Cash 14983 7534
2 9185164487 45477575 2024-09-23 13:07:12 2024-09-23 13:25:12 2024-09-23 13:29:12 On Time 839.05 UPI 39859 9886
3 9644738826 88067569 2023-11-24 16:16:56 2023-11-24 16:34:56 2023-11-24 16:33:56 On Time 440.23 Card 61497 7917
4 5427684290 83298567 2023-11-20 05:00:39 2023-11-20 05:17:39 2023-11-20 05:18:39 On Time 2526.68 Cash 84315 2741
In [5]:
#missing values
print('Missing Values:', df.isnull().sum(), sep="\n")
Missing Values:
order_id                  0
customer_id               0
order_date                0
promised_delivery_time    0
actual_delivery_time      0
delivery_status           0
order_total               0
payment_method            0
delivery_partner_id       0
store_id                  0
dtype: int64
order_id                  0
customer_id               0
order_date                0
promised_delivery_time    0
actual_delivery_time      0
delivery_status           0
order_total               0
payment_method            0
delivery_partner_id       0
store_id                  0
dtype: int64
In [38]:
# duplicates
print("Number of Duplicated Rows:", df.duplicated().sum())
Number of Duplicated Rows: 0
In [41]:
#checking anomalies
print("Descriptive Statistics:\n")

display(df.describe(include='all'))
Descriptive Statistics:
order_id customer_id order_date promised_delivery_time actual_delivery_time delivery_status order_total payment_method delivery_partner_id store_id
count 5.000000e+03 5.000000e+03 5000 5000 5000 5000 5000.00000 5000 5000.000000 5000.000000
unique NaN NaN 5000 4999 5000 3 NaN 4 NaN NaN
top NaN NaN 2024-07-17 08:34:01 2024-10-13 14:06:50 2024-07-17 08:47:01 On Time NaN Card NaN NaN
freq NaN NaN 1 2 1 3470 NaN 1285 NaN NaN
mean 5.029129e+09 5.009685e+07 NaN NaN NaN NaN 2201.86170 NaN 50050.318200 4999.689000
std 2.863533e+09 2.919082e+07 NaN NaN NaN NaN 1303.02438 NaN 28802.276922 2886.089242
min 6.046500e+04 3.181300e+04 NaN NaN NaN NaN 13.25000 NaN 43.000000 1.000000
25% 2.531421e+09 2.404314e+07 NaN NaN NaN NaN 1086.21500 NaN 24928.500000 2509.250000
50% 5.074378e+09 4.997808e+07 NaN NaN NaN NaN 2100.69000 NaN 50262.500000 4987.000000
75% 7.488579e+09 7.621215e+07 NaN NaN NaN NaN 3156.88250 NaN 74478.250000 7500.750000
max 9.998298e+09 9.989390e+07 NaN NaN NaN NaN 6721.46000 NaN 99968.000000 9995.000000
In [44]:
print("Categorical Column Summary:\n")

display(df.describe(include=['object']))
Categorical Column Summary:
order_date promised_delivery_time actual_delivery_time delivery_status payment_method
count 5000 5000 5000 5000 5000
unique 5000 4999 5000 3 4
top 2024-07-17 08:34:01 2024-10-13 14:06:50 2024-07-17 08:47:01 On Time Card
freq 1 2 1 3470 1285
In [6]:
if 'order_total' in df.columns:
    invalid_order_total = df[df['order_total'] <= 0]
    print("\n Rows with Invalid (≤0) Order Total:\n", invalid_order_total)

 Rows with Invalid (≤0) Order Total:
 Empty DataFrame
Columns: [order_id, customer_id, order_date, promised_delivery_time, actual_delivery_time, delivery_status, order_total, payment_method, delivery_partner_id, store_id]
Index: []
In [7]:
#inconsistent date. need to be rectified before proceeding. need to be in the same format in all columns.
#invalid rows can be removed
if {'promised_delivery_time', 'actual_delivery_time'}.issubset(df.columns):
    df['promised_delivery_time'] = pd.to_datetime(df['promised_delivery_time'], errors='coerce')
    df['actual_delivery_time'] = pd.to_datetime(df['actual_delivery_time'], errors='coerce')
    invalid_delivery = df[df['actual_delivery_time'] < df['order_date']]
    print("\nRows with Invalid Delivery Times (Before Order Date):\n", invalid_delivery)
/tmp/ipython-input-2044985845.py:4: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df['promised_delivery_time'] = pd.to_datetime(df['promised_delivery_time'], errors='coerce')
/tmp/ipython-input-2044985845.py:5: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df['actual_delivery_time'] = pd.to_datetime(df['actual_delivery_time'], errors='coerce')

Rows with Invalid Delivery Times (Before Order Date):
         order_id  customer_id      order_date promised_delivery_time  \
1281  2148242015     39093069  30-04-23 23:48    2023-01-05 00:04:00   
1563  7821942520     84468124  12-02-24 23:57    2024-02-13 00:12:00   
2933  3368317718     70867736  31-05-24 23:59    2024-01-06 00:17:00   

     actual_delivery_time        delivery_status  order_total payment_method  \
1281  2023-01-05 00:33:00  Significantly Delayed      3239.66            UPI   
1563  2024-02-13 00:13:00                On Time      2732.91         Wallet   
2933  2024-01-06 00:12:00                On Time       914.88            UPI   

      delivery_partner_id  store_id  
1281                13676      9904  
1563                11136      5805  
2933                96055       522  
In [43]:
#unique countsin each column
print("Unique Value Counts per Column:", df.nunique(), sep="\n")
Unique Value Counts per Column:
order_id                  5000
customer_id               2172
order_date                5000
promised_delivery_time    4999
actual_delivery_time      5000
delivery_status              3
order_total               4550
payment_method               4
delivery_partner_id       5000
store_id                  5000
dtype: int64