Lessons in the Research-to-Production Pipeline: From Data Science to Software Engineering
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

269 lines
8.6 KiB

import dataclasses
import json
import numpy as np
import pandas as pd
import streamlit as st
from scipy.stats import distributions as dist
from scipy.stats import gaussian_kde as gkde
import plotly.express as px
import plotly.graph_objects as go
# st.set_page_config(layout="wide")

# Built-in job titles; this list is rebuilt further down from the role data.
titles = [
    "Architect",
    "Engineer",
    "Sr. Engineer",
]

# Number of random draws generated per employee in a forecast.
num_samples = 10_000
@dataclasses.dataclass(frozen=True)
class Employee:
    """Immutable roster entry that is identified solely by ``name``.

    Equality, ordering and hashing all look at ``name`` only, so two records
    carrying the same name are treated as the same employee regardless of
    their other fields. Keeping ``__hash__`` consistent with ``__eq__`` is
    required for correct behaviour inside sets and dict keys.
    """

    name: str = "John Doe"
    title: str = "unknown"
    region: int = 0
    # A salary of 0 means "unspecified"; the forecast then simulates one.
    salary: float = 0.0
    # Performance score; the forecast only rewards values above 0.5.
    score: float = 0.5

    def __eq__(self, other):
        """Return True when ``other`` is an Employee with the same name."""
        if not isinstance(other, Employee):
            # Let Python fall back to its default handling for foreign types.
            return NotImplemented
        return self.name == other.name

    def __lt__(self, other):
        """Order employees alphabetically by name (enables ``sorted``)."""
        if not isinstance(other, Employee):
            return NotImplemented
        return self.name < other.name

    def __hash__(self):
        """Hash on ``name`` only so that equal employees hash equally."""
        return hash(self.name)
# Starter roster; it seeds the session's employee set on first run.
default_employees = [
    Employee(name, title)
    for name, title in (
        ("Alice", "Architect"),
        ("Bob", "Architect"),
        ("Cher", "Engineer"),
        ("David", "Sr. Engineer"),
        ("Eirene", "Engineer"),
        ("Fiona", "Sr. Engineer"),
        ("Gavin", "Engineer"),
    )
]
# Disabled stress test that pads the roster with 5000 generated employees:
# for i in range(5000):
#     default_employees.append(Employee(f"Gavin {i}", np.random.choice(titles)))
# Role definitions: one record per title holding its [min, max] salary range
# and its [min, max] percentage raise. Read from data.json when present,
# otherwise fall back to identical defaults for the three built-in titles.
try:
    # The context manager guarantees the file handle is closed after parsing.
    with open("data.json", "r", encoding="utf-8") as role_file:
        data = json.load(role_file)
except FileNotFoundError:
    data = [
        {"title": "Architect", "salary": [50000, 100000], "raise": [10, 20]},
        {"title": "Engineer", "salary": [50000, 100000], "raise": [10, 20]},
        {"title": "Sr. Engineer", "salary": [50000, 100000], "raise": [10, 20]},
    ]

# Per-title lookup tables derived from the role records.
titles = [d["title"] for d in data]
salaries = {d["title"]: d["salary"] for d in data}
increases = {d["title"]: d["raise"] for d in data}
st.title("Payroll Calculator")

# Yearly payroll ceiling chosen in the sidebar.
budget = st.sidebar.number_input(
    label="Maximum Payroll (dollars per year)",
    value=550000,
    step=5000,
)

# Seed the roster only once per session so later edits are not overwritten.
if "employees" not in st.session_state:
    st.session_state["employees"] = set(default_employees)
def add_employee(employee_name, title="unknown", region=0, salary=0, score=0):
    """Insert the employee into the session roster, replacing any namesake.

    Does nothing when the roster has not been initialised or when
    ``employee_name`` is empty.
    """
    if "employees" not in st.session_state or not employee_name:
        return
    # Remove any record with this name first so the call doubles as an update.
    remove_employee(employee_name)
    record = Employee(employee_name, title, region, salary, score)
    st.session_state["employees"].add(record)
def remove_employee(employee_name):
    """Remove the first roster entry whose name equals ``employee_name``.

    Does nothing when the roster has not been initialised, when the name is
    empty, or when nobody on the roster carries that name.
    """
    if "employees" not in st.session_state or not employee_name:
        return
    roster = st.session_state["employees"]
    # Locate the match before mutating, so the set is never changed mid-scan.
    match = next((e for e in roster if e.name == employee_name), None)
    if match is not None:
        roster.remove(match)
# Publish the per-title raise and salary tables into the session on first run;
# values already stored in the session are left untouched.
for _state_key, _initial in (("increases", increases), ("salary_ranges", salaries)):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial
# Editor for a single role: pick a title, adjust its raise and salary ranges,
# and store them in the session when "Set" is pressed.
with st.expander("Roles"):
    title = st.selectbox("Position", options=titles)
    _inc = st.select_slider(
        label=f"Percentage Increase for {title}",
        options=np.arange(0, 100),
        value=st.session_state.increases[title],
    )
    _sal = st.select_slider(
        label=f"Salary Range for {title}",
        options=np.arange(50000, 250001, 5000),
        value=st.session_state.salary_ranges[title],
    )
    if st.button("Set"):
        # Convert slider values to plain ints before storing them.
        st.session_state.increases[title] = list(map(int, _inc))
        st.session_state.salary_ranges[title] = list(map(int, _sal))
        st.markdown("Updated role definition.")
# Controls for the budget window used by the forecast: `a` and `b` are the
# shape parameters of the beta density, and `c` is the fraction of the budget
# at which the window starts.
with st.sidebar.expander("THE RED BOX"):
    a = st.slider("Upper", min_value=1.0, max_value=3.0, value=3.0, step=0.25)
    b = st.slider("Lower", min_value=1.0, max_value=3.0, value=1.0, step=0.25)
    c = st.slider("%MAX", min_value=0.5, max_value=1.0, value=0.90, step=0.05)
# Form for adding, updating or removing one employee by name.
with st.expander("Employees"):
    st.markdown("You can consider promotions here as well.")
    employee_title = st.selectbox("Employee position", options=titles)
    employee_name = st.text_input("Employee name")

    col1, col2, col3 = st.columns(3)
    with col1:
        salary = st.number_input("salary (optional)", value=0)
    with col3:
        region = st.number_input("region", value=0)
    with col2:
        performance = st.slider("performance", value=0.5)

    add_new_employee = st.button("Add or update employee")
    if add_new_employee:
        add_employee(
            employee_name,
            title=employee_title,
            region=region,
            salary=salary,
            score=performance,
        )

    rem_employee = st.button("Remove employee")
    if rem_employee:
        remove_employee(employee_name)
# Show the current roster, sorted by employee name, in the sidebar.
st.sidebar.markdown("### Employee Roster")
if st.session_state.get("employees"):
    roster_rows = [vars(e) for e in sorted(st.session_state["employees"])]
    st.sidebar.write(roster_rows)

# Local aliases for the session values consumed by the forecast.
employees = st.session_state.employees
# employees = default_employees
increases = st.session_state["increases"]
salary_ranges = st.session_state["salary_ranges"]

forecast = st.button("Forecast")
samples = dict()
import multiprocessing as mp
import numpy as np
from copy import deepcopy
# st.write(employees == set(default_employees))
# Root seed sequence; the forecast spawns one child seed per employee from it.
ss = np.random.SeedSequence()
def random_sampling(A):
    """Simulate current and post-raise salaries for a single employee.

    Args:
        A: Tuple ``(rng, employee, num_samples, salary_ranges, increases)``:
            rng: ``numpy.random.Generator`` supplying the random draws.
            employee: object exposing ``name``, ``title``, ``salary``, ``score``.
            num_samples: number of draws; converted with ``int()`` so values
                such as ``1e4`` are accepted.
            salary_ranges: mapping of title to ``(min, max)`` salary.
            increases: mapping of title to ``(min, max)`` percentage raise.

    Returns:
        ``(employee.name, sample)`` where ``sample`` maps ``"inc"`` (raise
        factor), ``"old"`` (current salary) and ``"new"`` (salary after the
        raise, capped at the title's maximum) to arrays of length
        ``num_samples``.

    Raises:
        KeyError: if the employee's title is missing from either mapping.
    """
    rng, employee, num_samples, salary_ranges, increases = A
    n_draws = int(num_samples)

    # TODO: revisit by zip code / region. USE salary_ranges[region][title]
    min_salary, max_salary = salary_ranges[employee.title]
    if employee.salary == 0:  # simulate salary if unspecified
        current_salary = rng.random(n_draws) * (max_salary - min_salary) + min_salary
    else:
        current_salary = np.ones(n_draws) * employee.salary

    min_raise, max_raise = increases[employee.title]
    # Raise factor drawn uniformly between the title's min and max percentage.
    random_increase = (
        100 + rng.random(n_draws) * (max_raise - min_raise) + min_raise
    ) / 100

    # TODO: revisit how score is used.
    # Scores above 0.5 earn a linear bonus of up to 10% (at score 1.0). The
    # bonus must be expressed as 1 + fraction: multiplying by the bare
    # fraction would shrink the salary instead of raising it.
    if employee.score > 0.5:
        random_increase *= 1 + (employee.score - 0.5) / 5

    sample = {
        "inc": random_increase,
        "old": current_salary,
        "new": np.minimum(current_salary * random_increase, max_salary),
    }
    return (employee.name, sample)
# Forecast run: simulate every employee's salary, total the payroll per
# scenario, then keep the scenarios whose new payroll fits the budget window.
if forecast and not employees:
    # With nobody on the roster the salary matrix is empty and cannot be summed.
    st.warning("Add at least one employee before forecasting.")
elif forecast:
    # One independent child seed per employee keeps their random streams apart.
    child_seeds = ss.spawn(len(employees))
    st.sidebar.write("Exploring Possibilities")
    salary_ranges = st.session_state.salary_ranges
    increases = st.session_state.increases
    st.write(salary_ranges, increases)
    # Sampling runs in-process through the builtin map; each employee's draw
    # is already vectorized, so no worker pool is created.
    samples_raw = map(
        random_sampling,
        [
            (np.random.default_rng(s), e, num_samples, salary_ranges, increases)
            for s, e in zip(child_seeds, employees)
        ],
    )
    samples = dict(samples_raw)
    # Column order of the salary matrices below follows this list exactly.
    employee_names = list(samples)

    st.sidebar.write("Predicting Budgets")
    # Rows are scenarios, columns are employees.
    old_salaries = np.array([samples[name]["old"] for name in employee_names]).T
    new_salaries = np.array([samples[name]["new"] for name in employee_names]).T
    old_payroll = old_salaries.sum(axis=1)
    new_payroll = new_salaries.sum(axis=1)

    fig = go.Figure()
    fig.add_trace(
        go.Histogram(x=old_payroll, histnorm="probability density", name="before")
    )
    fig.add_trace(
        go.Histogram(
            x=new_payroll,
            histnorm="probability density",
            name="after",
            marker_color="yellow",
        )
    )
    # Red box spanning the budget window [c * budget, budget].
    fig.add_vrect(
        x0=c * budget,
        x1=budget,
        line_color="red",
        line_width=5,
        annotation_text="budget",
        annotation_position="left",
    )
    fig.update_layout(
        title="Salary Forecast",
        xaxis_title="Required Amount ($)",
        yaxis_title="",
        font=dict(family="Courier New, monospace", size=18, color="#7f7f7f"),
    )

    st.sidebar.write("Performing Analysis")
    # Estimate the density of the simulated payroll from a random subsample.
    kde = gkde(np.random.choice(new_payroll, num_samples // 5))
    predicted_density = kde.pdf(new_payroll)
    # Target density: a beta distribution stretched over the budget window.
    observed_density = dist.beta(a=a, b=b, loc=c * budget, scale=(1 - c) * budget).pdf(
        new_payroll
    )
    # Accept each scenario with probability target / estimated density,
    # normalised by the largest ratio.
    ratio = observed_density / predicted_density
    peak = ratio.max()
    if peak > 0:
        accepted_inds = np.flatnonzero(np.random.rand(ratio.size) < ratio / peak)
    else:
        # No scenario falls inside the window (or the ratio is undefined).
        accepted_inds = np.array([], dtype=int)

    new_salaries_updated = new_payroll[accepted_inds]
    fig.add_trace(
        go.Histogram(
            x=new_salaries_updated, histnorm="probability density", name="options"
        )
    )
    fig.update_layout(
        legend=dict(
            orientation="h",
            yanchor="top",
            xanchor="right",
            y=1,
            x=1,
        )
    )
    st.plotly_chart(fig, use_container_width=True)

    st.markdown(f"Summary of {len(accepted_inds)} feasible new salaries (ranked)")
    # Label columns in the order the matrix was built, and only then sort them
    # alphabetically for display; labelling with pre-sorted names would attach
    # salaries to the wrong employees.
    df = pd.DataFrame(new_salaries[accepted_inds, :], columns=employee_names)
    df = df[sorted(employee_names)]
    df["total"] = new_payroll[accepted_inds]
    df = df.astype(int)
    st.write(df.sort_values("total").reset_index(drop=True))