research-to-production/salary/app.py

import dataclasses
import json

import numpy as np
import pandas as pd
import streamlit as st
from scipy.stats import distributions as dist
from scipy.stats import gaussian_kde as gkde
import plotly.express as px
import plotly.graph_objects as go

# st.set_page_config(layout="wide")
# Placeholder role names; replaced further down once role data is loaded.
titles = [
    "Architect",
    "Engineer",
    "Sr. Engineer",
]
# Number of Monte Carlo draws generated per employee.
num_samples = 10_000


@dataclasses.dataclass(frozen=True)
class Employee:
    """Immutable roster entry whose identity is the employee's name.

    Equality, hashing and ordering all use ``name`` only, so a set of
    employees holds at most one entry per name regardless of title, region,
    salary or score.

    Comparisons are deliberately duck-typed on ``.name`` instead of using
    ``isinstance``: the script is re-executed on every Streamlit rerun, which
    re-creates this class, and instances kept in session state from an earlier
    run must still compare and sort against freshly created ones.
    """

    name: str = "John Doe"
    title: str = "unknown"
    region: int = 0
    # A salary of 0 means "unspecified"; the forecast then simulates one.
    salary: float = 0.0
    score: float = 0.5

    def __eq__(self, other):
        """Return True when both objects carry the same ``name``."""
        try:
            return self.name == other.name
        except AttributeError:
            return NotImplemented

    def __hash__(self):
        """Hash on ``name`` only so the hash agrees with ``__eq__``."""
        # Without this, the dataclass machinery generates a hash over *all*
        # fields, which breaks the eq/hash contract and set de-duplication.
        return hash(self.name)

    def __lt__(self, other):
        """Order employees alphabetically by ``name``."""
        try:
            return self.name < other.name
        except AttributeError:
            return NotImplemented


# Starter roster used the first time a session is opened. Only name and title
# are given; region, salary and score keep the Employee defaults.
default_employees = [
    Employee(person, role)
    for person, role in (
        ("Alice", "Architect"),
        ("Bob", "Architect"),
        ("Cher", "Engineer"),
        ("David", "Sr. Engineer"),
        ("Eirene", "Engineer"),
        ("Fiona", "Sr. Engineer"),
        ("Gavin", "Engineer"),
    )
]
# for i in range(5000):
#     default_employees.append(Employee(f"Gavin {i}", np.random.choice(titles)))


# Load role definitions from data.json when present; otherwise fall back to
# built-in defaults. Each entry has a "title", a [min, max] "salary" range in
# dollars and a [min, max] "raise" range in percent.
try:
    # The context manager closes the handle even if JSON parsing fails.
    with open("data.json", "r", encoding="utf-8") as role_file:
        data = json.load(role_file)
except FileNotFoundError:
    data = [
        {"title": "Architect", "salary": [50000, 100000], "raise": [10, 20]},
        {"title": "Engineer", "salary": [50000, 100000], "raise": [10, 20]},
        {"title": "Sr. Engineer", "salary": [50000, 100000], "raise": [10, 20]},
    ]

# Per-title lookups derived from the role definitions.
titles = [d["title"] for d in data]
salaries = {d["title"]: d["salary"] for d in data}
increases = {d["title"]: d["raise"] for d in data}
# increases = {title: (10, 20) for title in titles}
# salaries = {title: (50000, 100000) for title in titles}

# Page header and the sidebar control for the payroll ceiling.
st.title("Payroll Calculator")
# Budget in dollars per year; defaults to 550000 and moves in steps of 5000.
budget = st.sidebar.number_input(
    "Maximum Payroll (dollars per year)", value=550000, step=5000
)

# Seed the roster once per session; later reruns keep whatever is stored.
if "employees" not in st.session_state:
    st.session_state["employees"] = set(default_employees)


def add_employee(employee_name, title="unknown", region=0, salary=0, score=0):
    """Insert an employee into the session roster, replacing any same-named entry.

    Does nothing when the roster has not been created in session state or
    when ``employee_name`` is empty.
    """
    if "employees" not in st.session_state or not employee_name:
        return

    # Clear out any existing record under this name so the add acts as an update.
    remove_employee(employee_name)
    replacement = Employee(employee_name, title, region, salary, score)
    st.session_state["employees"].add(replacement)


def remove_employee(employee_name):
    """Delete the first roster entry whose name matches ``employee_name``.

    Does nothing when the roster has not been created in session state, when
    ``employee_name`` is empty, or when no entry has that name.
    """
    if "employees" not in st.session_state or not employee_name:
        return

    roster = st.session_state["employees"]
    # Find the match before mutating so the set is never changed mid-iteration.
    match = next((member for member in roster if member.name == employee_name), None)
    if match is not None:
        roster.remove(match)


# Seed the per-title raise ranges once per session.
if "increases" not in st.session_state:
    st.session_state["increases"] = increases


# Seed the per-title salary ranges once per session.
if "salary_ranges" not in st.session_state:
    st.session_state["salary_ranges"] = salaries

# Role editor: pick a title, adjust its raise and salary ranges, then press
# "Set" to store them in session state.
with st.expander("Roles"):
    title = st.selectbox("Position", options=titles)
    # The initial value is the stored [min, max] pair for the selected title;
    # the choices are whole percentages from 0 to 99.
    _inc = st.select_slider(
        f"Percentage Increase for {title}",
        value=st.session_state.increases[title],
        options=np.arange(0, 100),
    )
    # Salary bounds selectable from 50000 to 250000 in steps of 5000.
    _sal = st.select_slider(
        f"Salary Range for {title}",
        value=st.session_state.salary_ranges[title],
        options=np.arange(50000, 250001, 5000),
    )
    if st.button("Set"):
        # Convert the slider values (drawn from NumPy arrays) to plain ints
        # before storing them.
        st.session_state.increases[title] = [int(i) for i in _inc]
        st.session_state.salary_ranges[title] = [int(i) for i in _sal]
        st.markdown("Updated role definition.")


# Sidebar controls for the target ("red box") region of the forecast.
# a and b are later passed as the two shape parameters of a beta distribution;
# c is the fraction of the budget at which that distribution's support starts.
with st.sidebar.expander("THE RED BOX"):
    a = st.slider("Upper", value=3.0, min_value=1.0, max_value=3.0, step=0.25)
    b = st.slider("Lower", value=1.0, min_value=1.0, max_value=3.0, step=0.25)
    # NOTE(review): c == 1.0 makes the beta scale (1 - c) * budget zero in the
    # forecast step — confirm that case is handled as intended.
    c = st.slider("%MAX", value=0.90, min_value=0.5, max_value=1.0, step=0.05)


# Employee editor: add, update or remove roster entries, and show the current
# roster in the sidebar.
with st.expander("Employees"):
    st.markdown("You can consider promotions here as well.")
    employee_title = st.selectbox("Employee position", options=titles)
    employee_name = st.text_input("Employee name")
    col1, col2, col3 = st.columns(3)
    with col1:
        # 0 means "unspecified"; the forecast then simulates a salary.
        salary = st.number_input("salary (optional)", value=0)
    with col3:
        region = st.number_input("region", value=0)
    with col2:
        performance = st.slider("performance", value=0.5)

    add_new_employee = st.button("Add or update employee")
    if add_new_employee:
        # add_employee removes any same-named entry first, so this also updates.
        add_employee(employee_name, employee_title, region, salary, performance)

    rem_employee = st.button("Remove employee")
    if rem_employee:
        remove_employee(employee_name)

    st.sidebar.markdown("### Employee Roster")
    if st.session_state.get("employees"):
        # Sorting relies on Employee.__lt__ (alphabetical by name); each entry
        # is shown as its attribute dict.
        st.sidebar.write([e.__dict__ for e in sorted(st.session_state["employees"])])


# Local aliases for the session-state objects the forecast reads.
employees = st.session_state.employees
# employees = default_employees

increases = st.session_state.increases
salary_ranges = st.session_state.salary_ranges
# True only on the rerun triggered by pressing the button.
forecast = st.button("Forecast")
samples = {}
# NOTE(review): mid-file imports; numpy is already imported at the top of the
# file and deepcopy is not used in the visible code.
import multiprocessing as mp
import numpy as np
from copy import deepcopy

# st.write(employees == set(default_employees))

# Root seed sequence; no seed is supplied, so fresh entropy is drawn each run.
# Child seeds are spawned from it, one per employee, when forecasting.
ss = np.random.SeedSequence()


def random_sampling(A):
    """Simulate current and post-raise salaries for a single employee.

    Args:
        A: A 5-tuple ``(rng, employee, num_samples, salary_ranges, increases)``:
            rng: a NumPy random ``Generator`` used for all draws.
            employee: object exposing ``name``, ``title``, ``salary``, ``score``.
            num_samples: number of draws; coerced with ``int()``.
            salary_ranges: mapping of title -> ``(min, max)`` salary.
            increases: mapping of title -> ``(min, max)`` raise in percent.

    Returns:
        ``(employee.name, sample)`` where ``sample`` maps:
            "inc": multiplicative raise factor per draw,
            "old": current salary per draw,
            "new": raised salary per draw, capped at the title's maximum.

    Raises:
        KeyError: if the employee's title is missing from either mapping.
    """
    rng, employee, num_samples, salary_ranges, increases = A
    n = int(num_samples)

    # TODO: revisit by zip code / region. USE salary_ranges[region][title]
    min_salary, max_salary = salary_ranges[employee.title]
    if employee.salary == 0:  # simulate salary if unspecified
        current_salary = rng.random(n) * (max_salary - min_salary) + min_salary
    else:
        current_salary = np.ones(n) * employee.salary

    min_inc, max_inc = increases[employee.title]
    # Uniform percentage raise within the title's range, as a factor (e.g. 1.15).
    random_increase = (100 + rng.random(n) * (max_inc - min_inc) + min_inc) / 100

    # TODO: revisit how score is used.
    # now: up to 10% linear increase based on performance, must be over 0.5
    if employee.score > 0.5:
        # The bonus has to scale the factor by (1 + bonus). Multiplying by the
        # bare bonus (at most 0.1) would slash the salary instead of raising it.
        random_increase *= 1 + (employee.score - 0.5) / 5

    sample = {
        "inc": random_increase,
        "old": current_salary,
        # Raised salaries never exceed the title's maximum.
        "new": np.minimum(current_salary * random_increase, max_salary),
    }
    return (employee.name, sample)


# Forecast: simulate every employee's old and new salary, compare total payroll
# against the budget window, and list the simulated outcomes that fall in it.
if forecast and not employees:
    # With no employees there is nothing to sum; summing along axis 1 of an
    # empty array would raise.
    st.warning("Add at least one employee before forecasting.")
elif forecast:
    # Sampling runs serially in this process. No multiprocessing pool is
    # created: an unused pool would leak worker processes on every click.
    child_seeds = ss.spawn(len(employees))
    st.sidebar.write("Exploring Possibilities")
    # samples_raw = pool.starmap(f, [ (np.random.default_rng(s), e, num_samples, salary_ranges, increases) for s, e in zip(child_seeds, employees) ])
    salary_ranges = st.session_state.salary_ranges
    increases = st.session_state.increases
    st.write(salary_ranges, increases)
    samples_raw = map(
        random_sampling,
        [
            (np.random.default_rng(s), e, num_samples, salary_ranges, increases)
            for s, e in zip(child_seeds, employees)
        ],
    )
    samples = {s[0]: s[1] for s in samples_raw}
    st.sidebar.write("Predicting Budgets")
    # Fix one column order (alphabetical by name) and use it for both the data
    # and the labels, so each column is attributed to the right employee.
    names = sorted(samples)
    # Rows are simulation draws, columns are employees.
    old_salaries = np.array([samples[name]["old"] for name in names]).T
    new_salaries = np.array([samples[name]["new"] for name in names]).T
    old_payroll = old_salaries.sum(axis=1)
    new_payroll = new_salaries.sum(axis=1)

    fig = go.Figure()
    fig.add_trace(
        go.Histogram(x=old_payroll, histnorm="probability density", name="before")
    )
    fig.add_trace(
        go.Histogram(
            x=new_payroll,
            histnorm="probability density",
            name="after",
            marker_color="yellow",
        )
    )
    # Red box marking the acceptable payroll window [c * budget, budget].
    fig.add_vrect(
        x0=c * budget,
        x1=budget,
        line_color="red",
        line_width=5,
        annotation_text="budget",
        annotation_position="left",
    )
    fig.update_layout(
        title="Salary Forecast",
        xaxis_title="Required Amount ($)",
        yaxis_title="",
        font=dict(family="Courier New, monospace", size=18, color="#7f7f7f"),
    )

    st.sidebar.write("Performing Analysis")
    # Estimate the simulated payroll density from a random subsample of draws.
    kde = gkde(np.random.choice(new_payroll, num_samples // 5))
    predicted_density = kde.pdf(new_payroll)

    # Target density: a beta distribution supported on [c * budget, budget].
    observed_density = dist.beta(a=a, b=b, loc=c * budget, scale=(1 - c) * budget).pdf(
        new_payroll
    )
    # Rejection sampling: keep each draw with probability proportional to
    # target density / simulated density, normalised by the largest ratio.
    ratio = observed_density / predicted_density
    ratio = ratio / max(ratio)
    accepted_inds = [r for r in range(num_samples) if np.random.rand() < ratio[r]]
    new_salaries_updated = new_payroll[accepted_inds]
    fig.add_trace(
        go.Histogram(
            x=new_salaries_updated, histnorm="probability density", name="options"
        )
    )
    fig.update_layout(
        legend=dict(
            orientation="h",
            yanchor="top",
            xanchor="right",
            y=1,
            x=1,
        )
    )
    st.plotly_chart(fig, use_container_width=True)

    st.markdown(f"Summary of {len(accepted_inds)} feasible new salaries (ranked)")
    df = pd.DataFrame(new_salaries[accepted_inds, :], columns=names)
    df["total"] = new_payroll[accepted_inds]
    df = df.astype(int)
    st.write(df.sort_values("total").reset_index(drop=True))
make more reproducible and better documented 2022-05-26 16:02:48 +00:00			`import dataclasses`
			`import json`

			`import numpy as np`
			`import pandas as pd`
			`import streamlit as st`
			`from scipy.stats import distributions as dist`
			`from scipy.stats import gaussian_kde as gkde`
			`import plotly.express as px`
			`import plotly.graph_objects as go`

			`# st.set_page_config(layout="wide")`
			`titles = ["Architect", "Engineer", "Sr. Engineer"]`
			`num_samples = int(1e4)`


			`@dataclasses.dataclass(frozen=True)`
			`class Employee:`
			`name: str = "John Doe"`
			`title: str = "unknown"`
			`region: int = 0`
			`salary: float = 0.0`
			`score: float = 0.5`

			`def __eq__(cls, other_cls):`
			`return cls.name == other_cls.name`

			`def __lt__(cls, other_cls):`
			`return cls.name < other_cls.name`


			`default_employees = [`
			`Employee("Alice", "Architect"),`
			`Employee("Bob", "Architect"),`
			`Employee("Cher", "Engineer"),`
			`Employee("David", "Sr. Engineer"),`
			`Employee("Eirene", "Engineer"),`
			`Employee("Fiona", "Sr. Engineer"),`
			`Employee("Gavin", "Engineer"),`
			`]`
			`# for i in range(5000):`
			`# default_employees.append(Employee(f"Gavin {i}", np.random.choice(titles)))`


			`try:`
			`data = json.load(open("data.json", "r"))`
			`except FileNotFoundError:`
			`data = [`
			`{"title": "Architect", "salary": [50000, 100000], "raise": [10, 20]},`
			`{"title": "Engineer", "salary": [50000, 100000], "raise": [10, 20]},`
			`{"title": "Sr. Engineer", "salary": [50000, 100000], "raise": [10, 20]},`
			`]`

			`titles = [d["title"] for d in data]`
			`salaries = {d["title"]: d["salary"] for d in data}`
			`increases = {d["title"]: d["raise"] for d in data}`
			`# increases = {title: (10, 20) for title in titles}`
			`# salaries = {title: (50000, 100000) for title in titles}`

			`st.title("Payroll Calculator")`
			`budget = st.sidebar.number_input(`
			`"Maximum Payroll (dollars per year)", value=550000, step=5000`
			`)`

			`if "employees" not in st.session_state:`
			`st.session_state["employees"] = set(default_employees)`


			`def add_employee(employee_name, title="unknown", region=0, salary=0, score=0):`
			`if "employees" in st.session_state and employee_name:`
			`remove_employee(employee_name)`
			`st.session_state["employees"].add(`
			`Employee(employee_name, title, region, salary, score)`
			`)`


			`def remove_employee(employee_name):`
			`if "employees" in st.session_state and employee_name:`
			`for e in st.session_state["employees"]:`
			`if e.name == employee_name:`
			`st.session_state["employees"].remove(e)`
			`break`


			`if "increases" not in st.session_state:`
			`st.session_state["increases"] = increases`


			`if "salary_ranges" not in st.session_state:`
			`st.session_state["salary_ranges"] = salaries`

			`with st.expander("Roles"):`
			`title = st.selectbox("Position", options=titles)`
			`_inc = st.select_slider(`
			`f"Percentage Increase for {title}",`
			`value=st.session_state.increases[title],`
			`options=np.arange(0, 100),`
			`)`
			`_sal = st.select_slider(`
			`f"Salary Range for {title}",`
			`value=st.session_state.salary_ranges[title],`
			`options=np.arange(50000, 250001, 5000),`
			`)`
			`if st.button("Set"):`
			`st.session_state.increases[title] = [int(i) for i in _inc]`
			`st.session_state.salary_ranges[title] = [int(i) for i in _sal]`
			`st.markdown("Updated role definition.")`


			`with st.sidebar.expander("THE RED BOX"):`
			`a = st.slider("Upper", value=3.0, min_value=1.0, max_value=3.0, step=0.25)`
			`b = st.slider("Lower", value=1.0, min_value=1.0, max_value=3.0, step=0.25)`
			`c = st.slider("%MAX", value=0.90, min_value=0.5, max_value=1.0, step=0.05)`


			`with st.expander("Employees"):`
			`st.markdown("You can consider promotions here as well.")`
			`employee_title = st.selectbox("Employee position", options=titles)`
			`employee_name = st.text_input("Employee name")`
			`col1, col2, col3 = st.columns(3)`
			`with col1:`
			`salary = st.number_input("salary (optional)", value=0)`
			`with col3:`
			`region = st.number_input("region", value=0)`
			`with col2:`
			`performance = st.slider("performance", value=0.5)`

			`add_new_employee = st.button("Add or update employee")`
			`if add_new_employee:`
			`add_employee(employee_name, employee_title, region, salary, performance)`

			`rem_employee = st.button("Remove employee")`
			`if rem_employee:`
			`remove_employee(employee_name)`

			`st.sidebar.markdown("### Employee Roster")`
			`if st.session_state.get("employees"):`
			`st.sidebar.write([e.__dict__ for e in sorted(st.session_state["employees"])])`


			`employees = st.session_state.employees`
			`# employees = default_employees`

			`increases = st.session_state.increases`
			`salary_ranges = st.session_state.salary_ranges`
			`forecast = st.button("Forecast")`
			`samples = {}`
			`import multiprocessing as mp`
			`import numpy as np`
			`from copy import deepcopy`

			`# st.write(employees == set(default_employees))`

			`ss = np.random.SeedSequence()`


			`def random_sampling(A):`
			`rng, employee, num_samples, salary_ranges, increases = A`
			`# return rng.random(int(num_samples))`
			`# return employee.name`
			`e = employee`
			`sample = {}`

			`# TODO: revisit by zip code / region. USE salary_ranges[region][title]`
			`mnS, mxS = salary_ranges[e.title]`
			`if e.salary == 0: # simulate salary if unspecified`
			`current_salary = rng.random(int(num_samples)) * (mxS - mnS) + mnS`
			`else:`
			`current_salary = np.ones(num_samples) * e.salary`

			`mnI, mxI = increases[e.title]`
			`# TODO: revisit how score is used.`
			`# now: up to 10% linear increase based on performance, must be over 0.5`
			`random_increase = (100 + rng.random(int(num_samples)) * (mxI - mnI) + mnI) / 100`
			`if e.score > 0.5:`
			`random_increase *= (e.score - 0.5) / 5`

			`sample["inc"], sample["old"], sample["new"] = (`
			`random_increase,`
			`current_salary,`
			`np.minimum(current_salary * random_increase, mxS),`
			`)`
			`return (e.name, sample)`


			`if forecast:`
			`# n_proc = min(max(( 1, mp.cpu_count() - 1 )), len(employees))`
			`n_proc = 8`
			`pool = mp.Pool(processes=n_proc)`
			`child_seeds = ss.spawn(len(employees))`
			`st.sidebar.write("Exploring Possibilities")`
			`# samples_raw = pool.starmap(f, [ (np.random.default_rng(s), e, num_samples, salary_ranges, increases) for s, e in zip(child_seeds, employees) ])`
			`salary_ranges = st.session_state.salary_ranges`
			`increases = st.session_state.increases`
			`st.write(salary_ranges, increases)`
			`samples_raw = map(`
			`random_sampling,`
			`[`
			`(np.random.default_rng(s), e, num_samples, salary_ranges, increases)`
			`for s, e in zip(child_seeds, employees)`
			`],`
			`)`
			`samples = {s[0]: s[1] for s in samples_raw}`
			`st.sidebar.write("Predicting Budgets")`
			`old_salaries = np.array([samples[s]["old"] for s in samples]).T`
			`new_salaries = np.array([samples[s]["new"] for s in samples]).T`
			`old_payroll = old_salaries.sum(axis=1)`
			`new_payroll = new_salaries.sum(axis=1)`

			`fig = go.Figure()`
			`mn, mx = round(old_payroll.min()), round(new_payroll.max())`
			`fig.add_trace(`
			`go.Histogram(x=old_payroll, histnorm="probability density", name="before")`
			`)`
			`fig.add_trace(`
			`go.Histogram(`
			`x=new_payroll,`
			`histnorm="probability density",`
			`name="after",`
			`marker_color="yellow",`
			`)`
			`)`
			`fig.add_vrect(`
			`x0=c * budget,`
			`x1=budget,`
			`line_color="red",`
			`line_width=5,`
			`annotation_text="budget",`
			`annotation_position="left",`
			`)`
			`fig.update_layout(`
			`title="Salary Forecast",`
			`xaxis_title="Required Amount ($)",`
			`yaxis_title="",`
			`font=dict(family="Courier New, monospace", size=18, color="#7f7f7f"),`
			`)`

			`st.sidebar.write("Performing Analysis")`
			`kde = gkde(np.random.choice(new_payroll, num_samples // 5))`
			`predicted_density = kde.pdf(new_payroll)`

			`observed_density = dist.beta(a=a, b=b, loc=c * budget, scale=(1 - c) * budget).pdf(`
			`new_payroll`
			`)`
			`ratio = observed_density / predicted_density`
			`ratio = ratio / max(ratio)`
			`accepted_inds = [r for r in range(num_samples) if np.random.rand() < ratio[r]]`
			`new_salaries_updated = new_payroll[accepted_inds]`
			`fig.add_trace(`
			`go.Histogram(`
			`x=new_salaries_updated, histnorm="probability density", name="options"`
			`)`
			`)`
			`fig.update_layout(`
			`legend=dict(`
			`orientation="h",`
			`yanchor="top",`
			`xanchor="right",`
			`y=1,`
			`x=1,`
			`)`
			`)`
			`st.plotly_chart(fig, use_container_width=True)`

			`st.markdown(f"Summary of {len(accepted_inds)} feasible new salaries (ranked)")`
			`df = pd.DataFrame(new_salaries[accepted_inds, :], columns=sorted(samples.keys()))`
			`df["total"] = new_payroll[accepted_inds]`
			`df = df.astype(int)`
			`st.write(df.sort_values("total").reset_index(drop=True))`