Estimated reading time: 23 minutes
import pandas
df_hist = pandas.read_excel("soda_sales_historical_data.xlsx")
df_hist[:5]

df_hist[df_hist["Product"]=="11 Down"]

from pandas import DataFrame, get_dummies
categorical_columns = ['Product','Easter Included','Super Bowl Included', 
                       'Christmas Included', 'Other Holiday']
df_hist = get_dummies(df_hist, prefix={k:"dmy_%s"%k for k in categorical_columns},
                      columns = list(categorical_columns))
df_hist[:5]
Sales Cost Per Unit 4 Wk Avg Temp 4 Wk Avg Humidity Sales M-1 weeks Sales M-2 weeks Sales M-3 weeks Sales M-4 Weeks Sales M-5 weeks dmy_Product_11 Down ... dmy_Product_Koala Kola dmy_Product_Mr. Popper dmy_Product_Popsi Kola dmy_Easter Included_No dmy_Easter Included_Yes dmy_Super Bowl Included_No dmy_Super Bowl Included_Yes dmy_Christmas Included_No dmy_Christmas Included_Yes dmy_Other Holiday_No
0 51.9 1.6625 80.69 69.19 17.0 22.4 13.5 14.5 28.0 1 ... 0 0 0 1 0 1 0 0 1 1
1 55.8 2.2725 80.69 69.19 2.4 2.2 2.0 1.4 0.5 0 ... 0 0 0 1 0 1 0 0 1 1
2 3385.6 1.3475 80.69 69.19 301.8 188.8 101.4 81.6 213.8 0 ... 0 0 0 1 0 1 0 0 1 1
3 63.5 1.6600 80.69 69.19 73.8 69.4 72.8 75.4 57.4 0 ... 0 0 0 1 0 1 0 0 1 1
4 181.1 1.8725 80.69 69.19 23.1 22.6 22.1 19.9 23.2 0 ... 0 0 1 1 0 1 0 0 1 1

5 rows × 25 columns

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingRegressor
from sklearn import model_selection
experiments = {"Algorithm":["Ordinary Least Squares", "Regression Tree", 
                            "Big Random Forest", "Random Forest", 
                            "Bagging"], 
               "Objects" : [lambda : LinearRegression(), 
                            lambda : DecisionTreeRegressor(), 
                            lambda : RandomForestRegressor(n_estimators=100), 
                            lambda : RandomForestRegressor(), 
                            lambda : BaggingRegressor()], 
               "Predictions":[[] for _ in range(5)]}
actuals = []
from sklearn.model_selection import train_test_split
for _ in range (4):
    train_X, test_X, train_y, test_y = (
        train_test_split(df_hist.drop("Sales", axis=1), 
                         df_hist["Sales"], test_size=0.25))
    for i, obj_factory in enumerate(experiments["Objects"]):
        obj = obj_factory()
        obj.fit(y=train_y,X=train_X)
        experiments["Predictions"][i] += list(obj.predict(test_X))
    actuals += list(test_y)
actuals = pandas.Series(actuals)
experiments["Predictions"] = list(map(pandas.Series, experiments["Predictions"]))
experiments["Results"] = []
for o in experiments["Objects"]:
    experiments["Results"].append(
        model_selection.cross_val_score(o(), y=df_hist['Sales'], 
                                        X=df_hist.drop("Sales", axis=1),
                                        cv=5).mean())
DataFrame(experiments).drop(["Objects", "Predictions"], 
                            axis=1).set_index("Algorithm")

fitted = (experiments["Objects"]
          [experiments["Algorithm"].index("Big Random Forest")]().
          fit(y=df_hist["Sales"], X=df_hist.drop("Sales", axis=1)))
df_superbowl_original = pandas.read_excel("super_bowl_promotion_data.xlsx")
df_superbowl = get_dummies(df_superbowl_original, 
                           prefix={k:"dmy_%s"%k for k in categorical_columns},
                           columns = list(categorical_columns))
assert "Sales" not in df_superbowl.columns 
assert {"Sales"}.union(df_superbowl.columns).issubset(set(df_hist.columns))
len(df_superbowl)
36
for fld in set(df_hist.columns).difference(df_superbowl.columns, {"Sales"}):
    assert fld.startswith("dmy_")
    df_superbowl[fld] = 0
    
df_superbowl = df_superbowl[list(df_hist.drop("Sales", axis=1).columns)]

predicted = fitted.predict(df_superbowl)

from __future__ import print_function
from ortools.constraint_solver import pywrapcp as cp

soda_family = {'11 Down': 'Clear', 'AB Root Beer': 'Dark', 
               'Alpine Stream': 'Clear', 'Bright': 'Clear', 
               'Crisp Clear': 'Clear', 'DC Kola': 'Dark',
               'Koala Kola': 'Dark', 'Mr. Popper': 'Dark', 
               'Popsi Kola': 'Dark'}
family  = set(soda_family[j] for j in soda_family)
soda    = set(j for j in soda_family)
max_prom = {f:2 for f in family}
product_prices = set(forecast_sales.index.values)
normal_price = {b:0 for b in soda}
for b,p in product_prices:
    normal_price[b] = max(normal_price[b],p)
forecast_sales = df_superbowl_original[["Product", "Cost Per Unit"]].copy()
forecast_sales["Sales"] = predicted

forecast_sales = forecast_sales.groupby(['Product','Cost Per Unit']).mean()

normal_price_2 = dict(forecast_sales.reset_index()[["Product","Cost Per Unit"]].groupby("Product").max()["Cost Per Unit"])



maxer = forecast_sales.reset_index().groupby("Product").max()


## This is to get the sales value of the max cost

maxer["Sales"] = forecast_sales[forecast_sales.index.isin(maxer.reset_index().set_index(["Product","Cost Per Unit"]).index)]["Sales"].values
maxer
Cost Per Unit Sales
Product
11 Down 1.5600 173.626
AB Root Beer 3.8425 376.175
Alpine Stream 2.2275 156.269
Bright 1.2900 2656.430
Crisp Clear 1.4700 140.921
DC Kola 1.9325 501.308
Koala Kola 2.5650 1423.454
Mr. Popper 2.9850 38.421
Popsi Kola 1.7500 145.957
import pandas as pd
forecast_sales_r =forecast_sales.reset_index()
bat = pd.DataFrame()
for product in maxer.index:
    rat = forecast_sales_r[forecast_sales_r["Product"]==product]["Cost Per Unit"]-maxer.loc[product,"Cost Per Unit"]
    rat = rat.to_frame()
    bat = pd.concat((bat,rat))

forecast_sales_r["marginal_investment"] =  bat.values
import pandas as pd

bat = pd.DataFrame()
for product in maxer.index:
    rat = forecast_sales_r[forecast_sales_r["Product"]==product]["Sales"]-maxer.loc[product,"Sales"]
    rat = rat.to_frame()
    bat = pd.concat((bat,rat))
forecast_sales_r["extra_sales"] =  bat.values
forecast_sales_r
Product Cost Per Unit Sales marginal_investment extra_sales
0 11 Down 1.4550 248.407 -0.1050 74.781
1 11 Down 1.5125 239.704 -0.0475 66.078
2 11 Down 1.5375 211.602 -0.0225 37.976
3 11 Down 1.5600 173.626 0.0000 0.000
4 AB Root Beer 3.7300 384.838 -0.1125 8.663
5 AB Root Beer 3.7700 389.141 -0.0725 12.966
6 AB Root Beer 3.8125 379.363 -0.0300 3.188
7 AB Root Beer 3.8425 376.175 0.0000 0.000
8 Alpine Stream 1.9975 201.513 -0.2300 45.244
9 Alpine Stream 2.1375 174.656 -0.0900 18.387
10 Alpine Stream 2.1500 176.016 -0.0775 19.747
11 Alpine Stream 2.2275 156.269 0.0000 0.000
12 Bright 1.2725 3790.724 -0.0175 1134.294
13 Bright 1.2825 2802.034 -0.0075 145.604
14 Bright 1.2900 2656.430 0.0000 0.000
15 Crisp Clear 1.3125 184.459 -0.1575 43.538
16 Crisp Clear 1.3425 173.047 -0.1275 32.126
17 Crisp Clear 1.4275 138.466 -0.0425 -2.455
18 Crisp Clear 1.4700 140.921 0.0000 0.000
19 DC Kola 1.8900 936.894 -0.0425 435.586
20 DC Kola 1.9150 568.984 -0.0175 67.676
21 DC Kola 1.9250 512.329 -0.0075 11.021
22 DC Kola 1.9325 501.308 0.0000 0.000
23 Koala Kola 2.4825 1394.552 -0.0825 -28.902
24 Koala Kola 2.5350 1408.344 -0.0300 -15.110
25 Koala Kola 2.5600 1426.688 -0.0050 3.234
26 Koala Kola 2.5650 1423.454 0.0000 0.000
27 Mr. Popper 2.8475 41.408 -0.1375 2.987
28 Mr. Popper 2.8925 41.404 -0.0925 2.983
29 Mr. Popper 2.8950 41.404 -0.0900 2.983
30 Mr. Popper 2.9000 41.382 -0.0850 2.961
31 Mr. Popper 2.9850 38.421 0.0000 0.000
32 Popsi Kola 1.6725 156.135 -0.0775 10.178
33 Popsi Kola 1.7125 145.706 -0.0375 -0.251
34 Popsi Kola 1.7275 147.310 -0.0225 1.353
35 Popsi Kola 1.7500 145.957 0.0000 0.000
for j in range(num_tasks):
    ## This in effect sets constraints on the domain
    t.append(solver.IntVar(0, 1, "x[%i,%i]" % (i, j)))
forecast_sales = forecast_sales.reset_index()
forecast_sales
Product Cost Per Unit Sales net_margin
0 11 Down 1.4550 248.407 0
1 11 Down 1.5125 239.704 0
2 11 Down 1.5375 211.602 0
3 11 Down 1.5600 173.626 0
4 AB Root Beer 3.7300 384.838 0
5 AB Root Beer 3.7700 389.141 0
6 AB Root Beer 3.8125 379.363 0
7 AB Root Beer 3.8425 376.175 0
8 Alpine Stream 1.9975 201.513 0
9 Alpine Stream 2.1375 174.656 0
10 Alpine Stream 2.1500 176.016 0
11 Alpine Stream 2.2275 156.269 0
12 Bright 1.2725 3790.724 0
13 Bright 1.2825 2802.034 0
14 Bright 1.2900 2656.430 0
15 Crisp Clear 1.3125 184.459 0
16 Crisp Clear 1.3425 173.047 0
17 Crisp Clear 1.4275 138.466 0
18 Crisp Clear 1.4700 140.921 0
19 DC Kola 1.8900 936.894 0
20 DC Kola 1.9150 568.984 0
21 DC Kola 1.9250 512.329 0
22 DC Kola 1.9325 501.308 0
23 Koala Kola 2.4825 1394.552 0
24 Koala Kola 2.5350 1408.344 0
25 Koala Kola 2.5600 1426.688 0
26 Koala Kola 2.5650 1423.454 0
27 Mr. Popper 2.8475 41.408 0
28 Mr. Popper 2.8925 41.404 0
29 Mr. Popper 2.8950 41.404 0
30 Mr. Popper 2.9000 41.382 0
31 Mr. Popper 2.9850 38.421 0
32 Popsi Kola 1.6725 156.135 0
33 Popsi Kola 1.7125 145.706 0
34 Popsi Kola 1.7275 147.310 0
35 Popsi Kola 1.7500 145.957 0
from ortools.constraint_solver import pywrapcp as cp


solver = cp.Solver("nothing_fancy")
x_array = []

"""x_array["Product", "Price"]"""

x_product = []
for i, product in enumerate(forecast_sales_r.drop_duplicates("Product")["Product"].values):
    t = []
    for j, price in enumerate(forecast_sales_r[forecast_sales_r["Product"]==product]["Product"].values):
        ## This in effect sets constraints on the domain
        t.append(solver.IntVar(0, 1, "x[%i,%i]" % (i, j)))
    x_array.extend(t)
    x_product.append(t)
## More Specific Keys
x_array = []

"""x_array["Product", "Price"]"""

x_product = []
for i, product in enumerate(forecast_sales_r.drop_duplicates("Product")["Product"].values):
    t = []
    for j, price in enumerate(forecast_sales_r[forecast_sales_r["Product"]==product]["Cost Per Unit"].values):
        ## This in effect sets constraints on the domain
        t.append(solver.IntVar(0, 1, "x[%s,%s]" % (product, price)))
    x_array.extend(t) ## produces a flat array
    x_product.append(t)
x_product
[[x[11 Down,1.455](0 .. 1),
  x[11 Down,1.5125](0 .. 1),
  x[11 Down,1.5375](0 .. 1),
  x[11 Down,1.56](0 .. 1)],
 [x[AB Root Beer,3.73](0 .. 1),
  x[AB Root Beer,3.77](0 .. 1),
  x[AB Root Beer,3.8125](0 .. 1),
  x[AB Root Beer,3.8425](0 .. 1)],
 [x[Alpine Stream,1.9975](0 .. 1),
  x[Alpine Stream,2.1375](0 .. 1),
  x[Alpine Stream,2.15](0 .. 1),
  x[Alpine Stream,2.2275](0 .. 1)],
 [x[Bright,1.2725](0 .. 1), x[Bright,1.2825](0 .. 1), x[Bright,1.29](0 .. 1)],
 [x[Crisp Clear,1.3125](0 .. 1),
  x[Crisp Clear,1.3425](0 .. 1),
  x[Crisp Clear,1.4275](0 .. 1),
  x[Crisp Clear,1.47](0 .. 1)],
 [x[DC Kola,1.89](0 .. 1),
  x[DC Kola,1.915](0 .. 1),
  x[DC Kola,1.925](0 .. 1),
  x[DC Kola,1.9325](0 .. 1)],
 [x[Koala Kola,2.4825](0 .. 1),
  x[Koala Kola,2.535](0 .. 1),
  x[Koala Kola,2.56](0 .. 1),
  x[Koala Kola,2.565](0 .. 1)],
 [x[Mr. Popper,2.8475](0 .. 1),
  x[Mr. Popper,2.8925](0 .. 1),
  x[Mr. Popper,2.895](0 .. 1),
  x[Mr. Popper,2.9](0 .. 1),
  x[Mr. Popper,2.985](0 .. 1)],
 [x[Popsi Kola,1.6725](0 .. 1),
  x[Popsi Kola,1.7125](0 .. 1),
  x[Popsi Kola,1.7275](0 .. 1),
  x[Popsi Kola,1.75](0 .. 1)]]
forecast_sales_r["Extra Sales Dollars"] = forecast_sales_r["Cost Per Unit"]*forecast_sales_r["extra_sales"]

forecast_sales_r["Investment Dollars"] = forecast_sales_r["marginal_investment"]*forecast_sales_r["Sales"]
## I would change it to say largest sales dollars increase
## and then compare that against the investment 
## UNFORTUNATTELY - I Turned this into a knapsack problem 
forecast_sales_r["Investment Dollars"] = forecast_sales_r["Investment Dollars"].abs()
forecast_sales_r.head()
Product Cost Per Unit Sales marginal_investment extra_sales Extra Sales Dollars Investment Dollars
0 11 Down 1.4550 248.407 -0.1050 74.781 108.806355 26.082735
1 11 Down 1.5125 239.704 -0.0475 66.078 99.942975 11.385940
2 11 Down 1.5375 211.602 -0.0225 37.976 58.388100 4.761045
3 11 Down 1.5600 173.626 0.0000 0.000 0.000000 0.000000
4 AB Root Beer 3.7300 384.838 -0.1125 8.663 32.312990 43.294275
weights = []
weight_1 = list(forecast_sales_r["Investment Dollars"].values)

weights.append(weight_1)

capacities = [350]
values = list(forecast_sales_r["Extra Sales Dollars"].values)
from ortools.algorithms import pywrapknapsack_solver

# Create the solver.
solver = pywrapknapsack_solver.KnapsackSolver(
  pywrapknapsack_solver.KnapsackSolver.
  KNAPSACK_DYNAMIC_PROGRAMMING_SOLVER,
  'test')

solver.Init(values, weights, capacities)
computed_value = solver.Solve()

packed_items = [x for x in range(0, len(weights[0]))
                if solver.BestSolutionContains(x)]
packed_weights = [weights[0][i] for i in packed_items]

print("Packed items: ", packed_items)
print("Packed weights: ", packed_weights)
print("Total weight (same as total value): ", computed_value)
Packed items:  [0, 1, 2, 5, 8, 9, 10, 12, 13, 15, 16, 19, 20, 21, 27, 28, 29, 30, 34]
Packed weights:  [26.082735000000014, 11.385940000000028, 4.761044999999996, 28.212722499999938, 46.34799000000006, 15.71903999999999, 13.641240000000034, 66.33767000000027, 21.01525500000017, 29.052292500000004, 22.0634925, 39.817995000000145, 9.957220000000047, 3.842467500000032, 5.693599999999984, 3.829869999999989, 3.7263599999999912, 3.517469999999996, 3.314474999999995]
Total weight (same as total value):  3220
packed_items = [x for x in range(0, len(weights[0]))
                  if solver.BestSolutionContains(x)]

## This dataframe is the best selection, using the knapsack problem.
## Different costs differnt values 
## The knapsack problem does appear i a lot of places 
forecast_sales_r[forecast_sales_r.index.isin(packed_items)]
Product Cost Per Unit Sales marginal_investment extra_sales Extra Sales Dollars Investment Dollars
0 11 Down 1.4550 248.407 -0.1050 74.781 108.806355 26.082735
1 11 Down 1.5125 239.704 -0.0475 66.078 99.942975 11.385940
2 11 Down 1.5375 211.602 -0.0225 37.976 58.388100 4.761045
5 AB Root Beer 3.7700 389.141 -0.0725 12.966 48.881820 28.212722
8 Alpine Stream 1.9975 201.513 -0.2300 45.244 90.374890 46.347990
9 Alpine Stream 2.1375 174.656 -0.0900 18.387 39.302213 15.719040
10 Alpine Stream 2.1500 176.016 -0.0775 19.747 42.456050 13.641240
12 Bright 1.2725 3790.724 -0.0175 1134.294 1443.389115 66.337670
13 Bright 1.2825 2802.034 -0.0075 145.604 186.737130 21.015255
15 Crisp Clear 1.3125 184.459 -0.1575 43.538 57.143625 29.052293
16 Crisp Clear 1.3425 173.047 -0.1275 32.126 43.129155 22.063492
19 DC Kola 1.8900 936.894 -0.0425 435.586 823.257540 39.817995
20 DC Kola 1.9150 568.984 -0.0175 67.676 129.599540 9.957220
21 DC Kola 1.9250 512.329 -0.0075 11.021 21.215425 3.842468
27 Mr. Popper 2.8475 41.408 -0.1375 2.987 8.505482 5.693600
28 Mr. Popper 2.8925 41.404 -0.0925 2.983 8.628327 3.829870
29 Mr. Popper 2.8950 41.404 -0.0900 2.983 8.635785 3.726360
30 Mr. Popper 2.9000 41.382 -0.0850 2.961 8.586900 3.517470
34 Popsi Kola 1.7275 147.310 -0.0225 1.353 2.337308 3.314475

[0, 1, 2, 5, 8, 9, 10, 12, 13, 15, 16, 19, 20, 21, 27, 28, 29, 30, 34]
total_cost = solver.IntVar(0, 300, "total_cost")

solver.Add(total_cost <= 300)
decision_builder = solver.Phase([x, y],
                                  solver.CHOOSE_FIRST_UNBOUND,
                                  solver.ASSIGN_MIN_VALUE)
# Total cost
solver.Add(
  total_cost < solver.Sum(solver.ScalProd(x_array, forecast_sales_r["marginal_investment"].values)))
objective = solver.Minimize(total_cost, 1)
db = solver.Phase(x_array,
                solver.CHOOSE_FIRST_UNBOUND, # The VAR strategy
                solver.ASSIGN_MIN_VALUE)     # The Value strategy

# Create a solution collector.
collector = solver.LastSolutionCollector()
# Add decision variables
#collector.Add(x_array)

for i in range(num_workers):
    collector.Add(x_worker[i])
---------------------------------------------------------------------------

NotImplementedError                       Traceback (most recent call last)

<ipython-input-126-c28e31bfa220> in <module>()
      1 # Total cost
      2 solver.Add(
----> 3   total_cost < solver.Sum(solver.ScalProd(x_array, forecast_sales_r["marginal_investment"].values)))
      4 objective = solver.Minimize(total_cost, 1)
      5 db = solver.Phase(x_array,


~/.local/lib/python3.6/site-packages/ortools/constraint_solver/pywrapcp.py in ScalProd(self, *args)
    393 
    394     def ScalProd(self, *args) -> "operations_research::IntExpr *":
--> 395         return _pywrapcp.Solver_ScalProd(self, *args)
    396 
    397     def MonotonicElement(self, values: 'operations_research::Solver::IndexEvaluator1', increasing: 'bool', index: 'IntVar') -> "operations_research::IntExpr *":


NotImplementedError: Wrong number or type of arguments for overloaded function 'Solver_ScalProd'.
  Possible C/C++ prototypes are:
    operations_research::Solver::MakeScalProd(std::vector< operations_research::IntVar *,std::allocator< operations_research::IntVar * > > const &,std::vector< int64,std::allocator< int64 > > const &)
    operations_research::Solver::MakeScalProd(std::vector< operations_research::IntVar *,std::allocator< operations_research::IntVar * > > const &,std::vector< int,std::allocator< int > > const &)
# Total cost
solver.Add(
  total_cost == solver.Sum( [solver.ScalProd(x_row, cost_row) for (x_row, cost_row) in zip(x_array, forecast_sales_r["marginal_investment"].values)]))
objective = solver.Minimize(total_cost, 1)
db = solver.Phase(x_array,
                solver.CHOOSE_FIRST_UNBOUND, # The VAR strategy
                solver.ASSIGN_MIN_VALUE)     # The Value strategy

# Create a solution collector.
collector = solver.LastSolutionCollector()
# Add decision variables
#collector.Add(x_array)

for i in range(num_workers):
    collector.Add(x_worker[i])
## I only want to do two promotions for easy soda category
## for now I will just ignore this constraint, 
## one soda onl once - true
max_prom
{'Clear': 2, 'Dark': 2}
for code, price, ranger in zip([ra[0] for ra in list(product_prices)],[ra[1] for ra in list(product_prices)],range(len([ra[1] for ra in list(product_prices)]))):
    print(code, price)
    select_price[code, ranger] = solver.IntVar(ub=price, lb=price, name='X')