import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.read_excel('https://github.com/theleadio/datascience_demo/blob/master/social-ads-raw.xlsx?raw=true')
df.sample(5)
df.describe()
df.shape
# find out the unique values in 'size' and 'area' columns
print("size:", df['size'].unique(), "; area:", df['area'].unique())
df['is_large'] = df['size'].map({"small":0, "large":1})
area_dummies = pd.get_dummies(df['area'])
df1 = df.join(area_dummies)
df1.sample(3)
figs, subs = plt.subplots(1,3, figsize=(20, 7))
sns.scatterplot(x='google', y='sales', hue='size', data=df1, ax=subs[0])
sns.scatterplot(x='facebook', y='sales', hue='size', data=df1, ax=subs[1])
sns.scatterplot(x='instagram', y='sales', hue='size', data=df1, ax=subs[2]);
Findings:
figs, subs = plt.subplots(1,3, figsize=(20, 7))
sns.regplot(x='google', y='sales', data=df1, ax=subs[0])
sns.regplot(x='facebook', y='sales', data=df1, ax=subs[1])
sns.regplot(x='instagram', y='sales', data=df1, ax=subs[2]);
Findings:
# create input features and target output
google = df1[['google']]
facebook = df1[['facebook']]
instagram = df1[['instagram']]
y = df1['sales']
# create model
from sklearn.linear_model import LinearRegression
google_model = LinearRegression()
facebook_model = LinearRegression()
instagram_model = LinearRegression()
# fit data into model
google_model.fit(google, y)
facebook_model.fit(facebook, y)
instagram_model.fit(instagram, y)
print("Google - R square:", google_model.score(google,y), "; Intercept:", google_model.intercept_, "; Coefficient:", google_model.coef_)
print("Facebook - R square:", facebook_model.score(facebook,y), "; Intercept:", facebook_model.intercept_, "; Coefficient:", facebook_model.coef_)
print("Instagram - R square:", instagram_model.score(instagram,y), "; Intercept:", instagram_model.intercept_, "; Coefficient:", instagram_model.coef_)
google_raw_unit = round(google_model.intercept_ * 1000)
facebook_raw_unit = round(facebook_model.intercept_ * 1000)
instagram_raw_unit = round(instagram_model.intercept_ * 1000)
google_ads_unit = round(google_model.coef_[0] * 1000)
facebook_ads_unit = round(facebook_model.coef_[0] * 1000)
instagram_ads_unit = round(instagram_model.coef_[0] * 1000)
print(f"If we don't spend a single cent on ads, we will be able to sell: \nGoogle: {google_raw_unit} units\nFacebook: {facebook_raw_unit} units\nInstagram: {instagram_raw_unit} units")
print("\n")
print(f"For every thousand dollars that we spent on the each platform, we will be able to sell an additional of: \nGoogle: {google_ads_unit} units\nFacebook: {facebook_ads_unit} units\nInstagram: {instagram_ads_unit} units")
# formula for predicting sales unit: y = ax + c * 1000, a = coefficient, c = intercept
ads_amount = input("The amount that we are going to spend? ")
platform = input("On which platform? ")
int_ads_amount = int(ads_amount)
platform = platform.lower()
if platform == 'google':
total_units = ((0.04753664 * int_ads_amount / 1000) + 7.032593549127695)*1000
print(f"If we spend ${'{:,}'.format(int_ads_amount)} on {platform}, we will be able to sell {'{:,}'.format(round(total_units))} units in total, an additional of {'{:,}'.format(round(total_units-google_raw_unit))} units")
elif platform == 'facebook':
total_units = ((0.20249578 * int_ads_amount / 1000) + 9.311638095158283)*1000
print(f"If we spend ${'{:,}'.format(int_ads_amount)} on {platform}, we will be able to sell {'{:,}'.format(round(total_units))} units in total, an additional of {'{:,}'.format(round(total_units-facebook_raw_unit))} units.")
else:
total_units = ((0.0546931 * int_ads_amount / 1000) + 12.35140706927816)*1000
print(f"If we spend ${'{:,}'.format(int_ads_amount)} on {platform}, we will be able to sell {'{:,}'.format(round(total_units))} units in total, an additional of {'{:,}'.format(round(total_units-instagram_raw_unit))} units.")
Findings:
# use multiple features to improve accuracy
df1.columns
input1 = ['google', 'facebook', 'instagram', 'is_large', 'rural', 'suburban', 'urban']
input2 = ['google', 'facebook', 'instagram', 'rural', 'suburban', 'urban']
input3 = ['google', 'facebook', 'instagram', 'is_large']
input4 = ['google', 'facebook', 'is_large', 'rural', 'suburban', 'urban']
input5 = ['google', 'facebook', 'instagram']
input_list = [input1, input2, input3, input4, input5]
for x in input_list:
X = df1[x]
y = df1['sales']
model = LinearRegression()
model.fit(X,y)
print("R square:", model.score(X,y), "; Intercept:", model.intercept_, "; Coefficient:", model.coef_)
input1 that includes all the columns has the highest R-square value, so we will use features in input1 for the subsequent modeling.
For every $1,000 spent on these variables, we will be able to sell an additional of:
It seems like by marketing on Facebook, in a large market in an urban area, will provide the highest return.
However, the accuracy score for Facebook as per our previous isolated modeling is found to be half the time less accurate than Google.
final_input = input1
X = df1[final_input]
y = df1['sales']
model = LinearRegression()
model.fit(X,y)
predict1 = [50, 40, 10] # 'google', 'facebook', 'instagram'
predict2 = [60, 30, 10]
predict3 = [70, 20, 10]
predict4 = [80, 10, 10]
predict5 = [40, 50, 10]
predict6 = [30, 60, 10]
predict7 = [20, 70, 10]
predict8 = [10, 80, 10]
budget_allocation = [predict1, predict2, predict3, predict4, predict5, predict6, predict7, predict8]
for x in budget_allocation:
# select large market and urban area only
market_area = [1, 0, 0, 1] # 'is_large', 'rural', 'suburban', 'urban'
x.extend(market_area)
print(f"For {x}, the total predicted sales will be {'{:,}'.format(round(model.predict([x])[0] *1000))} units.")
The more budget spent on Facebook, the higher our return. However, due to its performance inconsistency, we can't put all our budgets into one single platform.
Hence the budget should be allocated to the other more consistent platform like Google as well.
My preferred budget allocation will be:
and they will be spent on a large market, in an urban area.
The predicted sales will be around 14,432 units sold out of this $100k budget.