# attribution_report/attribution.py
# Snapshot: 2015-07-17 10:57:43 -04:00 — 264 lines, 10 KiB, Python
# -*- coding: utf-8 -*-
import os
import json
import datetime
import logging
import subprocess
import sys
import pandas as pd
from dateutil.relativedelta import relativedelta
import easygui
L = logging.getLogger(__name__)
def get_dropbox_dir():
    """
    Return the default directory for file dialogs.

    Reads the Dropbox ``info.json`` (Windows: %APPDATA%\\Dropbox,
    elsewhere: ~/.dropbox) and prefers the Business path over the
    personal one, returning it as a ``/*.xls`` glob pattern.  Falls back
    to the user's home directory when Dropbox is not installed or its
    config is unreadable — previously a missing config file raised an
    unhandled IOError even though a home-dir fallback branch existed.
    """
    if os.name == "nt":
        dropbox_file = os.path.join(os.getenv('APPDATA'), 'Dropbox', 'info.json')
    else:
        dropbox_file = os.path.expanduser("~/.dropbox/info.json")
    try:
        with open(dropbox_file) as dbf:
            dbconfig = json.load(dbf)
    except (IOError, OSError, ValueError):
        # No Dropbox install (or corrupt config): use the home directory,
        # consistent with the existing no-business/no-personal fallback.
        return os.path.expanduser("~")
    if "business" in dbconfig:
        dropbox_dir = dbconfig['business']['path'] + "/*.xls"
    elif "personal" in dbconfig:
        dropbox_dir = dbconfig['personal']['path'] + "/*.xls"
    else:
        dropbox_dir = os.path.expanduser("~")
    return dropbox_dir
class AttributionReport(object):
    """
    Builds plasmid-attribution reports by cross-referencing a Salesforce
    export against an Addgene deposit-data export.

    Both input files are chosen by the user through easygui dialogs; the
    matched results are written as dated .xlsx reports (see run()).
    """

    def __init__(self, months=6, footer_length=None):
        """
        :param months: size of the match window — a deposit counts only if
            received within this many months of the Salesforce row's date.
        :param footer_length: number of trailing junk rows Salesforce
            appends to its export (stripped in clean_dataframes); None/0
            means no footer.
        """
        self.months = months
        self.footer_length = footer_length
        # Column names used for sorting and filtering.
        self.SF_DATE_COLUMN = "Date"
        self.DP_DATE_COLUMN = "Date Received"
        self.PI_COLUMN = "PI_Name"
        self.ORG_COLUMN = "Org Name"
        # Output the XLSX in this order
        self.OUTPUT_COLUMN_ORDER = ["Addgene Assigned", "Plasmid ID", "Deposit ID", "Institute", "PI Name",
                                    "Date Received", "Original Date", "Original ORG", "Original PI"]
        self.ACCEPTABLE_EXTENSIONS = ["*.csv", "*.xls", "*.xlsx"]
        # Columns that need to be in the files.
        self.REQUIRED_SF_COLUMNS = ["First Name", "Last Name", "Account Name", "Date", "Assigned"]
        self.REQUIRED_DP_COLUMNS = ["Org Name", "Deposit ID", "Plasmid ID", "PI_Name", "Date Received"]
        # After load and merging, delete these columns.
        self.SF_TRIM_COLUMNS = ["Subject", "First Name", "Last Name", "Created Date", "LIMS Organization ID",
                                "Account Description"]
        self.DP_TRIM_COLUMNS = ["Org ID", "Deposit Status", "PI_ID", "Date Available", "# Orders",
                                "# Plasmids in the Deposit", "Addgene Contact", "Country"]
        self.DEFAULT_DIR = get_dropbox_dir()

    def _get_dataframe_by_extension(self, path, date_cols):
        """
        Load ``path`` into a DataFrame based on its extension (.csv or
        .xls/.xlsx), or show an error dialog and exit.

        :param path: file path selected by the user.
        :param date_cols: column indexes to parse as datetimes.
        """
        _, ext = os.path.splitext(path)
        ext = ext.lower()  # tolerate .CSV / .XLS etc.
        if ext == ".csv":
            df = pd.read_csv(path, parse_dates=date_cols, encoding='utf-8')
        elif ext in (".xlsx", ".xls"):
            # read_excel no longer accepts an ``encoding`` argument.
            df = pd.read_excel(path, parse_dates=date_cols)
        else:
            easygui.msgbox("File was not of type {0}.\nQuitting".format(
                " ".join(self.ACCEPTABLE_EXTENSIONS)),
                "ERROR")
            sys.exit(1)
        return df

    def _prompt_for_export(self, title, cancel_label, column_label, date_cols, required_columns):
        """
        Prompt the user for one export file, load it, and validate that
        the required columns are present; exits via easygui dialog on
        cancel or bad columns.
        """
        path = easygui.fileopenbox(title,
                                   default=self.DEFAULT_DIR,
                                   filetypes=self.ACCEPTABLE_EXTENSIONS)
        # fileopenbox returns None on cancel in newer easygui, "." in older.
        if not path or path == ".":
            easygui.msgbox("You did not select a {0}, stopping program.".format(cancel_label),
                           "Good Bye")
            sys.exit(1)
        df = self._get_dataframe_by_extension(path, date_cols=date_cols)
        # BUG FIX: was ``set(required) < set(df.columns)`` — a *strict*
        # subset test that wrongly rejected files containing exactly the
        # required columns and nothing more.
        if set(required_columns).issubset(set(df.columns)):
            L.info("Proper columns")
        else:
            L.info("Wrong columns")
            easygui.msgbox("At a minimum, the {0} file must have the following columns:\n\n"
                           "{1}\n\n"
                           "Please re-run and select a proper file.".format(column_label,
                                                                            ", ".join(required_columns)),
                           "Incorrect columns")
            sys.exit(1)
        return df

    def get_dataframes(self):
        """Prompt for both exports, validate, clean, and return them."""
        salesforce_df = self._prompt_for_export("Salesforce Export",
                                                "Salesforce Export",
                                                "Salesforce",
                                                date_cols=[4, 5],
                                                required_columns=self.REQUIRED_SF_COLUMNS)
        deposit_df = self._prompt_for_export("Deposit Data",
                                             "Deposit Data Export",
                                             "Deposit Data",
                                             date_cols=[7, 8],
                                             required_columns=self.REQUIRED_DP_COLUMNS)
        salesforce_df, deposit_df = self.clean_dataframes(salesforce_df, deposit_df)
        return salesforce_df, deposit_df

    def clean_dataframes(self, salesforce_df, deposit_df):
        """
        Strip the Salesforce footer, sort both frames by their date
        column, normalize name fields, and drop unneeded columns.

        :returns: (salesforce_df, deposit_df) cleaned copies.
        """
        # Get rid of the footer that Salesforce adds.
        if self.footer_length:
            length_with_footer = len(salesforce_df.index)
            salesforce_df = salesforce_df.head(length_with_footer - self.footer_length)
        # BUG FIX: the old ``df.sort(...)`` calls returned new frames that
        # were discarded, so neither frame was actually sorted and the
        # searchsorted() windowing downstream ran on unsorted data.
        salesforce_df = salesforce_df.sort_values(self.SF_DATE_COLUMN, ascending=True)
        salesforce_df["Full Name"] = salesforce_df["First Name"].astype(str) + " " + salesforce_df["Last Name"]
        # Cleanup Deposit Data; reset the index so positional windows from
        # searchsorted() line up with iloc slicing.
        deposit_df = deposit_df.copy()
        deposit_df['Org Name'] = deposit_df['Org Name'].fillna('')
        deposit_df = deposit_df.sort_values(self.DP_DATE_COLUMN, ascending=True).reset_index(drop=True)
        # BUG FIX: astype() result was discarded — assign it back.
        deposit_df['PI_Name'] = deposit_df['PI_Name'].astype(str)
        # Cleanup not needed columns (tolerate exports missing some).
        salesforce_df = salesforce_df.drop(columns=self.SF_TRIM_COLUMNS, errors='ignore')
        deposit_df = deposit_df.drop(columns=self.DP_TRIM_COLUMNS, errors='ignore')
        return salesforce_df, deposit_df

    def get_filtered(self, filtered_df, sf_row, pi_name, pi_org, org=False):
        """
        Match ``sf_row`` against the date-windowed deposit rows.

        Filters on PI name by default, or on organization name when
        ``org=True``.  Matches where *both* the PI and org agree go to
        ``double``; matches where only the filtered field agrees go to
        ``single``.

        :returns: (single, double) lists of output-row dicts.
        """
        if org:
            filter_column, filter_value = self.ORG_COLUMN, pi_org
        else:
            filter_column, filter_value = self.PI_COLUMN, pi_name
        single, double = [], []
        name_match = filtered_df[filtered_df[filter_column] == filter_value]
        for _, row in name_match.iterrows():
            data = {
                "Addgene Assigned": sf_row['Assigned'],
                "Plasmid ID": row['Plasmid ID'],
                "Deposit ID": row['Deposit ID'],
                "Institute": row['Org Name'],
                "PI Name": row['PI_Name'],
                "Date Received": row[self.DP_DATE_COLUMN],
                "Original Date": sf_row[self.SF_DATE_COLUMN],
                "Original ORG": pi_org,
                "Original PI": pi_name,
            }
            if (data['Institute'] == data['Original ORG']) and \
                    (data['PI Name'] == data['Original PI']):
                double.append(data)
            else:
                single.append(data)
        return single, double

    def get_attribution_dataframes(self):
        """
        Walk every Salesforce row, window the deposit data to the
        ``self.months`` months following the row's date, and collect
        matches by PI name and by organization name.

        :returns: tuple of ("sheet label", DataFrame) pairs ready for
            export — currently "PI" and "Double".
        """
        salesforce_df, deposit_df = self.get_dataframes()
        name_matches = []
        org_matches = []
        double_matches = []
        mismatches = []
        # Iterate through the Salesforce report as the master document.
        for _, sf_row in salesforce_df.iterrows():
            # Half-open window [start_date, end_date); deposit_df is sorted
            # by DP_DATE_COLUMN in clean_dataframes, so searchsorted gives
            # valid positional bounds.
            start_date = sf_row[self.SF_DATE_COLUMN]
            end_date = start_date + relativedelta(months=self.months)
            # BUG FIX: searchsorted returns a scalar in modern pandas (the
            # old ``[0]`` indexing fails), and label-based ``.ix`` slicing
            # on a shuffled integer index gave wrong windows — use iloc.
            start = int(deposit_df[self.DP_DATE_COLUMN].searchsorted(start_date))
            end = int(deposit_df[self.DP_DATE_COLUMN].searchsorted(end_date))
            filtered_df = deposit_df.iloc[start:end]
            # Short names so we don't have to index sf_row repeatedly.
            pi_name = str(sf_row['Full Name'])
            pi_org = sf_row['Account Name']
            # Get matches by the PI's name.
            by_name, pi_by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org)
            name_matches.extend(by_name)
            mismatches.extend(by_name)
            double_matches.extend(pi_by_both)
            # Get matches by the organization name.
            by_org, org_by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org, org=True)
            org_matches.extend(by_org)
            mismatches.extend(by_org)
            double_matches.extend(org_by_both)
        return (
            ("PI", pd.DataFrame(name_matches, columns=self.OUTPUT_COLUMN_ORDER)),
            # ("Institute", pd.DataFrame(org_matches, columns=self.OUTPUT_COLUMN_ORDER)),
            ("Double", pd.DataFrame(double_matches, columns=self.OUTPUT_COLUMN_ORDER)),
            # ("Single", pd.DataFrame(mismatches, columns=self.OUTPUT_COLUMN_ORDER))
        )

    def run(self):
        """
        Build the report frames, ask the user where to save them, write
        one de-duplicated XLSX per frame, then reveal the output in the
        OS file browser.
        """
        frames = self.get_attribution_dataframes()
        self.dirname = easygui.diropenbox("Where to save reports?", "Select Report Output Directory", self.DEFAULT_DIR)
        if not self.dirname:
            self.dirname = self.DEFAULT_DIR
        xls_path = None
        for key, df in frames:
            fname = '{0}_Attribution_Report_{1}_Match.xlsx'.format(datetime.date.today(), key)
            xls_path = os.path.join(self.dirname, fname)
            deduped_df = df.drop_duplicates()
            with pd.ExcelWriter(xls_path, engine='xlsxwriter') as writer:
                deduped_df.to_excel(writer, sheet_name='Sheet1', index=False)
        if os.name == "nt":
            # Argument list instead of a shell string — avoids quoting
            # problems with spaces in the directory path.
            subprocess.call(["explorer", self.dirname])
        elif xls_path is not None:
            # Reveal the last written report in the macOS Finder.
            subprocess.call(["open", "-R", xls_path])
def main():
    """
    Entry point: run the attribution report flow and show any unexpected
    error in an easygui exception dialog.
    """
    try:
        report = AttributionReport(months=6, footer_length=6)
        report.run()
        easygui.msgbox("Done, your files are saved where you chose.", "Done!")
    except Exception:
        # BUG FIX: was a bare ``except:``, which also caught the
        # SystemExit raised by a deliberate user cancel (sys.exit(1) in
        # the flow) and wrongly popped an exception dialog for it.
        easygui.exceptionbox()


if __name__ == '__main__':
    main()