From 146519b5f0d7242db2232d60038f7a6ef59351a8 Mon Sep 17 00:00:00 2001
From: Michael Pilosov
Date: Thu, 26 May 2022 16:02:48 +0000
Subject: [PATCH] make more reproducible and better documented

---
 salary/.dockerignore    |   2 +
 salary/Dockerfile       |  13 ++
 salary/Makefile         |  10 ++
 salary/README.md        |  36 ++++++
 salary/app.py           | 269 ++++++++++++++++++++++++++++++++++++++++
 salary/data.json        |   5 +
 salary/requirements.txt |   6 +
 salary/st               |  15 +++
 8 files changed, 356 insertions(+)
 create mode 100644 salary/.dockerignore
 create mode 100644 salary/Dockerfile
 create mode 100644 salary/Makefile
 create mode 100644 salary/README.md
 create mode 100644 salary/app.py
 create mode 100644 salary/data.json
 create mode 100644 salary/requirements.txt
 create mode 100755 salary/st

diff --git a/salary/.dockerignore b/salary/.dockerignore
new file mode 100644
index 0000000..2edd2b0
--- /dev/null
+++ b/salary/.dockerignore
@@ -0,0 +1,2 @@
+data.json
+README.md
diff --git a/salary/Dockerfile b/salary/Dockerfile
new file mode 100644
index 0000000..1e6d96c
--- /dev/null
+++ b/salary/Dockerfile
@@ -0,0 +1,13 @@
+FROM python:3.9.3
+
+WORKDIR /app
+
+COPY app.py ./app.py
+# data.json is excluded by .dockerignore, so it cannot be COPY'd (the build would fail); ./st bind-mounts the working directory at run time instead.
+COPY requirements.txt ./requirements.txt
+
+RUN pip3 install -r requirements.txt
+
+# swap health-check endpoint to be where GCP looks for it by default
+# RUN find /usr/local/lib/python3.9/site-packages/streamlit -type f \( -iname \*.py -o -iname \*.js \) -print0 | xargs -0 sed -i 's/healthz/health-check/g'
+
diff --git a/salary/Makefile b/salary/Makefile
new file mode 100644
index 0000000..2bc3549
--- /dev/null
+++ b/salary/Makefile
@@ -0,0 +1,10 @@
+run:
+	./st run app.py
+
+install:
+	pip install -r requirements.txt
+
+build:
+	docker build -t streamlit:latest .
+
+.PHONY: run install build
diff --git a/salary/README.md b/salary/README.md
new file mode 100644
index 0000000..5d74f49
--- /dev/null
+++ b/salary/README.md
@@ -0,0 +1,36 @@
+# Salary Assessment App
+
+Demonstration of an interactive web-application built in Python.
+This application takes in an employee roster (built interactively, best for small teams), with names, positions, and salaries for each employee.
+It then allows someone (e.g. a manager) to simulate promotions and/or salary raises and see the overall impact on the final payroll budget (visualized as a pair of overlaying distributions which show the before/after scenario under the provided ranges).
+Finally, the app "solves the problem" of assigning a new salary to each employee while staying below some pre-defined budget for payroll, using a novel Monte Carlo method implemented directly within the app itself (rather than relying on importing another library).
+
+> This example shows "a small amount of effort" exerted to ensure reproducibility and readability but ultimately lacks in overall user-friendliness.
+
+Takeaways: *Good reproducibility, passable style/formatting.*
+
+- Of particular note here is the executable `st` shell script, which supports running the application from a docker image and falls back to native shell execution if `docker` is not in the system `$PATH`.
+- Note the style in which functions are written in `app.py`
+  - There is a mixture of functions and procedural code, wide abuse of global variables, and a lot of messy plotting code. Is it readable overall?
+  - The app's "state" is held in `data.json`, is saved in a human-readable format (as opposed to binary), and is small enough to be negligible
+  - The functions that are defined at least have readable names
+  - If the app grew any larger than this, one might be wise to migrate functions into a separate module and import it in `app.py`
+- Note the lack of a proper `README` / we just "presume" the user knows what to do with the presence of a `Makefile`. Is this acceptable in your opinion?
+  - Very minimal `Makefile` is "sort of self-documenting" but many people don't know to look there. It could use documentation so that `make help` at least produces some sort of a helpful guide
+
+
+## Usage
+
+(if using local `python` instead of `docker`):
+```
+make install
+```
+
+then
+
+```bash
+make run
+```
+
+and visit `localhost:8501` (or `//proxy/8501/` in Jupyter if you have `jupyter-server-proxy` installed to access the app via proxy)
+
diff --git a/salary/app.py b/salary/app.py
new file mode 100644
index 0000000..ca35619
--- /dev/null
+++ b/salary/app.py
@@ -0,0 +1,269 @@
+import dataclasses
+import json
+
+import numpy as np
+import pandas as pd
+import streamlit as st
+from scipy.stats import distributions as dist
+from scipy.stats import gaussian_kde as gkde
+import plotly.express as px
+import plotly.graph_objects as go
+
+# st.set_page_config(layout="wide")
+titles = ["Architect", "Engineer", "Sr. Engineer"]
+num_samples = int(1e4)
+
+
+@dataclasses.dataclass(frozen=True)
+class Employee:  # identity (==, hash, ordering) is the employee name alone
+    name: str = "John Doe"
+    title: str = dataclasses.field(default="unknown", compare=False)
+    region: int = dataclasses.field(default=0, compare=False)
+    salary: float = dataclasses.field(default=0.0, compare=False)
+    score: float = dataclasses.field(default=0.5, compare=False)
+
+    def __eq__(self, other):  # compare=False above keeps the generated __hash__ name-only too
+        return self.name == other.name if isinstance(other, Employee) else NotImplemented
+
+    def __lt__(self, other):
+        return self.name < other.name
+
+
+default_employees = [
+    Employee("Alice", "Architect"),
+    Employee("Bob", "Architect"),
+    Employee("Cher", "Engineer"),
+    Employee("David", "Sr. Engineer"),
+    Employee("Eirene", "Engineer"),
+    Employee("Fiona", "Sr. Engineer"),
+    Employee("Gavin", "Engineer"),
+]
+# Role definitions are read from data.json; built-in defaults apply if it is absent.
+
+
+try:
+    with open("data.json", "r") as state_file:  # close the handle deterministically
+        data = json.load(state_file)
+except FileNotFoundError:
+    data = [
+        {"title": "Architect", "salary": [50000, 100000], "raise": [10, 20]},
+        {"title": "Engineer", "salary": [50000, 100000], "raise": [10, 20]},
+        {"title": "Sr. Engineer", "salary": [50000, 100000], "raise": [10, 20]},
+    ]
+
+titles = [d["title"] for d in data]
+salaries = {d["title"]: d["salary"] for d in data}
+increases = {d["title"]: d["raise"] for d in data}
+
+st.title("Payroll Calculator")
+budget = st.sidebar.number_input(
+    "Maximum Payroll (dollars per year)", value=550000, step=5000
+)
+
+if "employees" not in st.session_state:
+    st.session_state["employees"] = set(default_employees)
+
+
+def add_employee(employee_name, title="unknown", region=0, salary=0, score=0):
+    """Insert or replace the roster entry named employee_name; no-op for an empty name."""
+    if "employees" in st.session_state and employee_name:
+        remove_employee(employee_name)
+        st.session_state["employees"].add(
+            Employee(employee_name, title, region, salary, score)
+        )
+
+
+def remove_employee(employee_name):
+    """Drop the roster entry whose name equals employee_name, if there is one."""
+    if "employees" in st.session_state and employee_name:
+        for e in st.session_state["employees"]:
+            if e.name == employee_name:
+                st.session_state["employees"].remove(e)
+                break  # stop immediately: the set was just mutated mid-iteration
+
+
+if "increases" not in st.session_state:
+    st.session_state["increases"] = increases
+
+
+if "salary_ranges" not in st.session_state:
+    st.session_state["salary_ranges"] = salaries
+
+with st.expander("Roles"):
+    title = st.selectbox("Position", options=titles)
+    _inc = st.select_slider(
+        f"Percentage Increase for {title}",
+        value=st.session_state.increases[title],
+        options=np.arange(0, 100),
+    )
+    _sal = st.select_slider(
+        f"Salary Range for {title}",
+        value=st.session_state.salary_ranges[title],
+        options=np.arange(50000, 250001, 5000),
+    )
+    if st.button("Set"):
+        st.session_state.increases[title] = [int(i) for i in _inc]
+        st.session_state.salary_ranges[title] = [int(i) for i in _sal]
+        st.markdown("Updated role definition.")
+
+
+with st.sidebar.expander("THE RED BOX"):
+    a = st.slider("Upper", value=3.0, min_value=1.0, max_value=3.0, step=0.25)
+    b = st.slider("Lower", value=1.0, min_value=1.0, max_value=3.0, step=0.25)
+    c = st.slider("%MAX", value=0.90, min_value=0.5, max_value=1.0, step=0.05)
+
+
+with st.expander("Employees"):
+    st.markdown("You can consider promotions here as well.")
+    employee_title = st.selectbox("Employee position", options=titles)
+    employee_name = st.text_input("Employee name")
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        salary = st.number_input("salary (optional)", value=0)
+    with col3:
+        region = st.number_input("region", value=0)
+    with col2:
+        performance = st.slider("performance", value=0.5)
+
+    add_new_employee = st.button("Add or update employee")
+    if add_new_employee:
+        add_employee(employee_name, employee_title, region, salary, performance)
+
+    rem_employee = st.button("Remove employee")
+    if rem_employee:
+        remove_employee(employee_name)
+
+    st.sidebar.markdown("### Employee Roster")
+    if st.session_state.get("employees"):
+        st.sidebar.write([e.__dict__ for e in sorted(st.session_state["employees"])])
+
+
+employees = st.session_state.employees
+# employees = default_employees
+
+increases = st.session_state.increases
+salary_ranges = st.session_state.salary_ranges
+forecast = st.button("Forecast")
+samples = {}
+import multiprocessing as mp
+import numpy as np
+from copy import deepcopy
+
+# st.write(employees == set(default_employees))
+
+ss = np.random.SeedSequence()
+
+
+def random_sampling(A):
+    """Return (name, {"inc", "old", "new"}) arrays of sampled raises/salaries for one employee."""
+    # A packs (rng, employee, num_samples, salary_ranges, increases) into one argument for map().
+    rng, employee, num_samples, salary_ranges, increases = A
+    e = employee
+    sample = {}
+
+    # TODO: revisit by zip code / region. USE salary_ranges[region][title]
+    mnS, mxS = salary_ranges[e.title]
+    if e.salary == 0:  # simulate salary if unspecified
+        current_salary = rng.random(int(num_samples)) * (mxS - mnS) + mnS
+    else:
+        current_salary = np.ones(num_samples) * e.salary
+
+    mnI, mxI = increases[e.title]
+    # TODO: revisit how score is used.
+    # now: up to 10% linear increase based on performance, must be over 0.5
+    random_increase = (100 + rng.random(int(num_samples)) * (mxI - mnI) + mnI) / 100
+    if e.score > 0.5:
+        random_increase *= 1 + (e.score - 0.5) / 5  # bonus factor, 1.1 at score 1.0; was a bare <= 0.1 multiplier
+
+    sample["inc"], sample["old"], sample["new"] = (
+        random_increase,
+        current_salary,
+        np.minimum(current_salary * random_increase, mxS),  # raises are capped at the role maximum
+    )
+    return (e.name, sample)
+
+
+if forecast:
+    # Sampling runs serially through map() below. The mp.Pool that used to be
+    # created here was never used or closed, so it only spawned idle worker
+    # processes on every click of "Forecast".
+    child_seeds = ss.spawn(len(employees))
+    st.sidebar.write("Exploring Possibilities")
+    # Each employee gets its own generator built from its own spawned child seed.
+    salary_ranges = st.session_state.salary_ranges
+    increases = st.session_state.increases
+    st.write(salary_ranges, increases)
+    samples_raw = map(
+        random_sampling,
+        [
+            (np.random.default_rng(s), e, num_samples, salary_ranges, increases)
+            for s, e in zip(child_seeds, employees)
+        ],
+    )
+    samples = {s[0]: s[1] for s in samples_raw}
+    st.sidebar.write("Predicting Budgets")
+    old_salaries = np.array([samples[s]["old"] for s in samples]).T
+    new_salaries = np.array([samples[s]["new"] for s in samples]).T
+    old_payroll = old_salaries.sum(axis=1)
+    new_payroll = new_salaries.sum(axis=1)
+
+    fig = go.Figure()
+    mn, mx = round(old_payroll.min()), round(new_payroll.max())
+    fig.add_trace(
+        go.Histogram(x=old_payroll, histnorm="probability density", name="before")
+    )
+    fig.add_trace(
+        go.Histogram(
+            x=new_payroll,
+            histnorm="probability density",
+            name="after",
+            marker_color="yellow",
+        )
+    )
+    fig.add_vrect(
+        x0=c * budget,
+        x1=budget,
+        line_color="red",
+        line_width=5,
+        annotation_text="budget",
+        annotation_position="left",
+    )
+    fig.update_layout(
+        title="Salary Forecast",
+        xaxis_title="Required Amount ($)",
+        yaxis_title="",
+        font=dict(family="Courier New, monospace", size=18, color="#7f7f7f"),
+    )
+
+    st.sidebar.write("Performing Analysis")
+    kde = gkde(np.random.choice(new_payroll, num_samples // 5))
+    predicted_density = kde.pdf(new_payroll)
+
+    observed_density = dist.beta(a=a, b=b, loc=c * budget, scale=(1 - c) * budget).pdf(
+        new_payroll
+    )
+    ratio = observed_density / predicted_density
+    ratio = ratio / max(ratio)
+    accepted_inds = [r for r in range(num_samples) if np.random.rand() < ratio[r]]
+    new_salaries_updated = new_payroll[accepted_inds]
+    fig.add_trace(
+        go.Histogram(
+            x=new_salaries_updated, histnorm="probability density", name="options"
+        )
+    )
+    fig.update_layout(
+        legend=dict(
+            orientation="h",
+            yanchor="top",
+            xanchor="right",
+            y=1,
+            x=1,
+        )
+    )
+    st.plotly_chart(fig, use_container_width=True)
+
+    st.markdown(f"Summary of {len(accepted_inds)} feasible new salaries (ranked)")
+    df = pd.DataFrame(new_salaries[accepted_inds, :], columns=list(samples))  # dict order == column order of new_salaries
+    df["total"] = new_payroll[accepted_inds]
+    df = df.astype(int)
+    st.write(df.sort_values("total").reset_index(drop=True))
diff --git a/salary/data.json b/salary/data.json
new file mode 100644
index 0000000..39b82c9
--- /dev/null
+++ b/salary/data.json
@@ -0,0 +1,5 @@
+[
+    {"title": "Architect", "salary": [50000, 100000], "raise": [10, 20]},
+    {"title": "Engineer", "salary": [50000, 100000], "raise": [10, 20]},
+    {"title": "Sr. Engineer", "salary": [50000, 100000], "raise": [10, 20]}
+]
diff --git a/salary/requirements.txt b/salary/requirements.txt
new file mode 100644
index 0000000..e410c15
--- /dev/null
+++ b/salary/requirements.txt
@@ -0,0 +1,6 @@
+streamlit
+plotly-express
+pandas
+numpy
+scipy
+Equation
diff --git a/salary/st b/salary/st
new file mode 100755
index 0000000..09708d1
--- /dev/null
+++ b/salary/st
@@ -0,0 +1,15 @@
+#!/bin/bash
+echo -e "INFO:\tThis executable is a replacement for invoking \`streamlit\`; it will attempt to first launch a docker image \`streamlit:latest\` and if it cannot find \`docker\` then it will attempt to invoke \`streamlit\` directly (you will need to run \`pip install -r requirements.txt\` for it to work)\n\n"
+
+IMAGE_NAME=streamlit:latest
+COMMAND="streamlit"
+OPTS="--browser.serverAddress 0.0.0.0 --server.enableCORS False --server.enableXsrfProtection False"
+
+if ! command -v docker &> /dev/null
+then
+    echo -e "WARNING:\tdocker could not be found, attempting running locally...\n"
+    $COMMAND "$@" $OPTS
+else
+    echo -e "INFO:\t mounting \`pwd\` into container at mountpoint (and working directory) \`/tmp\` so that latest version of app & state are reflected.\n"
+    docker run --name streamlit --rm -d -p 8501:8501 -v "$(pwd)":/tmp -w /tmp "$IMAGE_NAME" "$COMMAND" "$@" $OPTS
+fi