Since I'm still in the learning phase of data analysis (a self-taught Python learner), I found two datasets on Kaggle to give myself a practice run.
Preliminaries
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from random import choice
Getting data
Here I downloaded two datasets: the first one is about coffee sales, and the second is about the 1,000 most popular bloggers on Instagram.
I'll start with the coffee sales table, because when I looked at the second table I honestly had no idea where to begin with it.
# Read the files in the directory
directory = r'C:\Users\Admin\Desktop\demo\exercise'
files = os.listdir(directory)
print(files)
['coffee_result.csv', '']
# Store the paths of the CSV files
files_list = []
for file in files:
    if file.endswith('.csv'):
        directory_file = fr'{directory}\{file}'
        files_list.append(directory_file)
print(files_list)
['C:\\Users\\Admin\\Desktop\\demo\\exercise\\coffee_result.csv', 'C:\\Users\\Admin\\Desktop\\demo\\exercise\\']
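As an aside, pathlib can do the same collection in one pass; a small sketch reusing the directory variable from above:
from pathlib import Path
# Collect every CSV path in the directory in one pass
csv_paths = [str(p) for p in Path(directory).glob('*.csv')]
print(csv_paths)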
# Read the needed files
df = pd.read_csv(files_list[0])
View some essential information
df.info()
df
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1464 entries, 0 to 1463
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 date 1464 non-null object
1 datetime 1464 non-null object
2 cash_type 1464 non-null object
3 card 1375 non-null object
4 money 1464 non-null float64
5 coffee_name 1464 non-null object
dtypes: float64(1), object(5)
memory usage: 68.8+ KB
| | date | datetime | cash_type | card | money | coffee_name |
---|---|---|---|---|---|---|
0 | 2024-03-01 | 2024-03-01 10:15:50.520 | card | ANON-0000-0000-0001 | 38.70 | Latte |
1 | 2024-03-01 | 2024-03-01 12:19:22.539 | card | ANON-0000-0000-0002 | 38.70 | Hot Chocolate |
2 | 2024-03-01 | 2024-03-01 12:20:18.089 | card | ANON-0000-0000-0002 | 38.70 | Hot Chocolate |
3 | 2024-03-01 | 2024-03-01 13:46:33.006 | card | ANON-0000-0000-0003 | 28.90 | Americano |
4 | 2024-03-01 | 2024-03-01 13:48:14.626 | card | ANON-0000-0000-0004 | 38.70 | Latte |
... | ... | ... | ... | ... | ... | ... |
1459 | 2024-09-05 | 2024-09-05 20:30:14.964 | card | ANON-0000-0000-0587 | 32.82 | Cappuccino |
1460 | 2024-09-05 | 2024-09-05 20:54:24.429 | card | ANON-0000-0000-0588 | 23.02 | Americano |
1461 | 2024-09-05 | 2024-09-05 20:55:31.429 | card | ANON-0000-0000-0588 | 32.82 | Cappuccino |
1462 | 2024-09-05 | 2024-09-05 21:26:28.836 | card | ANON-0000-0000-0040 | 27.92 | Americano with Milk |
1463 | 2024-09-05 | 2024-09-05 21:27:29.969 | card | ANON-0000-0000-0040 | 27.92 | Americano with Milk |
1464 rows × 6 columns
print(df['cash_type'].unique().tolist(),'\n',
len(df['card'].unique().tolist()),'\n',
df['coffee_name'].unique().tolist(),'\n',
len(df['coffee_name'].unique().tolist()))
['card', 'cash']
589
['Latte', 'Hot Chocolate', 'Americano', 'Americano with Milk', 'Cocoa', 'Cortado', 'Espresso', 'Cappuccino']
8
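A quicker way to get the same cardinalities for every column at once is nunique; a small sketch (note that nunique ignores NaN by default, so card should show one fewer than the 589 above, which includes NaN):
# Unique-value counts for all columns in one call
print(df.nunique())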
The information returned by info() shows that there are some nulls in the card column, so I'll deal with them.
df[df['card'].isnull()]
| | date | datetime | cash_type | card | money | coffee_name |
---|---|---|---|---|---|---|
12 | 2024-03-02 | 2024-03-02 10:30:35.668 | cash | NaN | 40.0 | Latte |
18 | 2024-03-03 | 2024-03-03 10:10:43.981 | cash | NaN | 40.0 | Latte |
41 | 2024-03-06 | 2024-03-06 12:30:27.089 | cash | NaN | 35.0 | Americano with Milk |
46 | 2024-03-07 | 2024-03-07 10:08:58.945 | cash | NaN | 40.0 | Latte |
49 | 2024-03-07 | 2024-03-07 11:25:43.977 | cash | NaN | 40.0 | Latte |
... | ... | ... | ... | ... | ... | ... |
657 | 2024-05-31 | 2024-05-31 09:23:58.791 | cash | NaN | 39.0 | Latte |
677 | 2024-06-01 | 2024-06-01 20:54:59.267 | cash | NaN | 39.0 | Cocoa |
685 | 2024-06-02 | 2024-06-02 22:43:10.636 | cash | NaN | 34.0 | Americano with Milk |
691 | 2024-06-03 | 2024-06-03 21:42:51.734 | cash | NaN | 34.0 | Americano with Milk |
692 | 2024-06-03 | 2024-06-03 21:43:37.471 | cash | NaN | 34.0 | Americano with Milk |
89 rows × 6 columns
The nulls come from rows where the payment type is cash, so there is no card number to record.
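A quick sanity check of that (it should print True if the missing card numbers all come from cash payments):
# Confirm that all rows with a missing card number are cash payments
print((df.loc[df['card'].isnull(), 'cash_type'] == 'cash').all())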
df['card'] = df['card'].fillna("-1")
df['card'].isnull().any()
np.False_
Processing the data
The info() output shows that the dtype of the date column is object, so I'll convert it to a datetime type to make my subsequent operations easier.
print(type(df.loc[1,'date']), type(df.loc[1,'datetime']))
df.loc[1,'date']
<class 'str'> <class 'str'>
'2024-03-01'
# Adjust the date format to extract the month of each row of data
df['date'] = pd.to_datetime(df['date'])
df['datetime'] = pd.to_datetime(df['datetime'])
df['month'] = df['date'].dt.month
print(len(df['month'].unique()))
7
View monthly sales
Since there were only 5 days of data in September, this month was not included in the analysis.
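To confirm that, here's a quick sketch counting the distinct days of data in each month (month 9 should show only 5):
# Number of distinct days of data in each month
print(df.groupby('month')['date'].nunique())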
# View monthly sales and amounts
df_six = df[df['month']!=9].copy()
month = df_six['month'].unique() # Separate out the months
month_sales = df_six.groupby('month')['money'].count()
month_sum = df_six.groupby('month')['money'].sum()
figure,axes = plt.subplots(1,2,figsize=[16,8])
figure.suptitle("Month sales and sum",size=20)
ax1 = axes[0].bar(month,month_sales)
axes[0].set_xlabel('Month',size=16)
axes[0].set_ylabel('Count',size=16)
ax2 = axes[1].bar(month,month_sum)
axes[1].set_xlabel('Month',size=16)
axes[1].set_ylabel('Sum',size=16)
axes[0].bar_label(ax1,fmt="%d",label_type="center")
axes[1].bar_label(ax2,fmt="%d",label_type="center")
plt.subplots_adjust(wspace=0.5)
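As a cross-check, the count and sum groupbys above can be collapsed into a single pivot_table call; a sketch:
# Monthly transaction count and revenue in one table
monthly = df_six.pivot_table(index='month', values='money', aggfunc=['count', 'sum'])
print(monthly)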
Sales statistics for each coffee
Monthly sales per coffee
nrows,ncols = 2,4
figure3,axes = plt.subplots(nrows,ncols,figsize=[16,8],sharex=True,sharey=True)
coffee_month_sales = df_six.groupby(['month','coffee_name'])['money'].sum().reset_index(name='sum')
coffee_names = coffee_month_sales['coffee_name'].unique().tolist()
for idx,coffee_name in enumerate(coffee_names):
    x,y = divmod(idx,ncols)
    coffee_data = coffee_month_sales[coffee_month_sales['coffee_name']==coffee_name]
    bars = axes[x,y].bar(coffee_data['month'],coffee_data['sum'])
    axes[x,y].bar_label(bars,fmt="%d",label_type="center")
    subtitle = f"{coffee_name} {int(coffee_data['sum'].sum())}"
    axes[x,y].set_title(subtitle)
    axes[x,y].set_xlabel('month',size=16)
    axes[x,y].set_ylabel('sum',size=16)
figure3.suptitle('coffee month sales',size=20)
plt.tight_layout()
plt.subplots_adjust(wspace=0.5)
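To double-check the subplots, the same per-coffee monthly sums can be pivoted into one wide table; a sketch reusing coffee_month_sales:
# Pivot to a month x coffee table of monthly revenue
wide = coffee_month_sales.pivot(index='month', columns='coffee_name', values='sum')
print(wide.round(2))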
See how many buyers each coffee reaches and what share of the audience it represents
stati = df_six.groupby('coffee_name')['money'].count().reset_index(name='buyers')
stati.sort_values(by='buyers',ascending=True,inplace=True,ignore_index=True)
figure2,axes = plt.subplots(1,2,figsize=(16,8))
figure2.suptitle("Coffee audience number and proportion",size=20)
ax1 = axes[0].barh(stati.iloc[:,0],stati.iloc[:,1])
axes[0].bar_label(ax1,fmt="%d",label_type="center")
axes[0].set_ylabel("Kind",size=16)
axes[0].set_xlabel("Sum",size=16)
axes[1].pie(stati.iloc[:,1],labels=stati.iloc[:,0],autopct='%0.1f')
plt.subplots_adjust(wspace=0.5)
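For reference, the pie chart's percentages can also be printed as plain numbers; a sketch using the stati frame from above:
# Audience share of each coffee as a percentage
share = stati.set_index('coffee_name')['buyers']
print((share / share.sum() * 100).round(1))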
Statistics on customers' actual spending
cardholder = df_six[df_six['card']!='-1'].copy()
cardholder['tag'] = 1
cardholder.drop(columns=['date','datetime','cash_type'],inplace=True)
cardholder['month_sum'] = cardholder.groupby('card')['tag'].transform('sum')
active_buyer = cardholder.groupby('card')['month_sum'].max().reset_index(name='buys')
active_buyer.sort_values(by='buys',inplace=True,ignore_index=True,ascending=False)
cardholder['money_sum'] = cardholder.groupby('card')['money'].transform('sum')
money_sum = cardholder.drop_duplicates(subset='card',ignore_index=True).copy()
money_sum.drop(columns=['money','coffee_name','month','tag','month_sum'],inplace=True)
money_sum.sort_values(by='money_sum',inplace=True,ignore_index=True,ascending=False)
result = pd.merge(active_buyer,money_sum)
print('Average of total amount spent:',result['money_sum'].mean(),'\n',
      result.head(10))
Average of total amount spent: 75.29034111310592
card buys money_sum
0 ANON-0000-0000-0012 96 2772.44
1 ANON-0000-0000-0009 67 2343.98
2 ANON-0000-0000-0141 44 1101.08
3 ANON-0000-0000-0097 38 1189.34
4 ANON-0000-0000-0040 30 910.12
5 ANON-0000-0000-0003 27 744.04
6 ANON-0000-0000-0001 17 646.14
7 ANON-0000-0000-0134 13 470.76
8 ANON-0000-0000-0024 12 422.26
9 ANON-0000-0000-0059 12 337.00
As you can see from the printout, these are the most active users.
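Incidentally, the whole merge chain above can be condensed into a single groupby with named aggregations; a sketch that reuses df_six and should closely match the table above (the order of ties may differ):
# Per-card purchase count and total spend in one groupby
summary = (df_six[df_six['card'] != '-1']
           .groupby('card')['money']
           .agg(buys='count', money_sum='sum')
           .reset_index()
           .sort_values('buys', ascending=False, ignore_index=True))
print(summary.head(10))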
That's about as far as I'll take this exercise. Thanks for reading, and if you know better ways to do any of this, please share them in the comments!