I am trying to speed up my nested loop it currently takes 15 mins for 100k customers. I am also having trouble adding an additional condition that only multiplies states (A,B,C) by lookup2 val, else multiplies by 1.
customer_data = pd.DataFrame({"cust_id": [1, 2, 3, 4, 5, 6, 7, 8],
"state": ['B', 'E', 'D', 'A', 'B', 'E', 'C', 'A'],
"cust_amt": [1000,300, 500, 200, 400, 600, 200, 300],
"year":[3, 3, 4, 3, 4, 2, 2, 4],
"group":[10, 25, 30, 40, 55, 60, 70, 85]})
state_list = ['A','B','C','D','E']
# All lookups should be dataframes with the year and/or group and the value like these.
lookup1 = pd.DataFrame({'year': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'lim %': 0.1})
lookup2 = pd.concat([pd.DataFrame({'group':g, 'lookup_val': 0.1, 'year':range(1, 11)}
for g in customer_data['group'].unique())]).explode('year')
multi_data = np.arange(250).reshape(10,5,5)
lookups = [lookup1, lookup2]
# Preprocessing.
# Transform the state to categorical code to use it as array index.
customer_data['state'] = pd.Categorical(customer_data['state'],
categories=state_list,
ordered=True).codes
# Set index on lookups.
for i in range(len(lookups)):
if 'group' in lookups[i].columns:
lookups[i] = lookups[i].set_index(['year', 'group'])
else:
lookups[i] = lookups[i].set_index(['year'])
calculation:
results = {}
for customer, state, amount, start, group in customer_data.itertuples(name=None, index=False):
for year in range(start, len(multi_data)+1):
if year == start:
results[customer] = [[amount * multi_data[year-1, state, :]]]
else:
results[customer].append([results[customer][-1][-1] @ multi_data[year-1]])
for lookup in lookups:
if isinstance(lookup.index, pd.MultiIndex):
value = lookup.loc[(year, group)].iat[0]
else:
value = lookup.loc[year].iat[0]
results[customer][-1].append(value * results[customer][-1][-1])
example of expected output:
{1: [[array([55000, 56000, 57000, 58000, 59000]),
array([5500., 5600., 5700., 5800., 5900.]),
array([550., 560., 570., 5800., 5900.])],...
