# attribution_report/main.py
# Exported 2015-07-16 16:21:22 -04:00 (282 lines, 11 KiB, Python)
import os
import json
import datetime
import logging
import sys
import pandas as pd
from dateutil.relativedelta import relativedelta
import easygui
# Module-level logger used for column-validation diagnostics.
L = logging.getLogger(__name__)
# NOTE(review): DROPBOX and local_dropbox_path appear unused in this module —
# candidates for removal once confirmed nothing else imports them.
DROPBOX = False
local_dropbox_path = "/Users/tyrelsouza/Dropbox (Addgene)/"
# Date column names in the Salesforce export and the Deposit export.
SF_DATE = "Date"
DP_DATE = "Date Received"
class AttributionReport(object):
    """Builds plasmid attribution reports by cross-referencing a Salesforce
    contact export with a deposit-data export, matching deposits either by
    PI name or by organization inside a sliding date window.
    """

    def __init__(self, credentials_file, months=6, footer_length=None):
        """
        :param credentials_file: path to a credentials JSON file.  Stored on
            the instance for later use (previously it was accepted but
            silently discarded).
        :param months: how many months after each Salesforce contact date
            to search the deposit data.
        :param footer_length: number of footer rows Salesforce appends to
            its export, trimmed before processing; None/0 disables trimming.
        """
        self.credentials_file = credentials_file
        self.months = months
        self.footer_length = footer_length
        # Column names used when matching deposit rows back to Salesforce.
        self.PI_COLUMN = "PI_Name"
        self.ORG_COLUMN = "Org Name"
        # Column order for every generated report sheet.
        self.OUTPUT_COLUMN_ORDER = ["Addgene Assigned",
                                    "Plasmid ID",
                                    "Deposit ID",
                                    "Institute",
                                    "PI Name",
                                    "Date Received",
                                    "Original Date",
                                    "Original ORG",
                                    "Original PI"]
        self.ACCEPTABLE_EXTENSIONS = ["*.csv",
                                      "*.xls",
                                      "*.xlsx"]
        # Columns that need to be in the files.
        self.REQUIRED_SF_COLUMNS = ["First Name",
                                    "Last Name",
                                    "Account Name",
                                    "Date",
                                    "Assigned"]
        self.REQUIRED_DP_COLUMNS = ["Org Name",
                                    "Deposit ID",
                                    "Plasmid ID",
                                    "PI_Name",
                                    "Date Received"]
        # After load and merging, delete these columns.
        self.SF_TRIM_COLUMNS = ["Subject",
                                "First Name",
                                "Last Name",
                                "Created Date",
                                "LIMS Organization ID",
                                "Account Description"]
        self.DP_TRIM_COLUMNS = ["Org ID",
                                "Deposit Status",
                                "PI_ID",
                                "Date Available",
                                "# Orders",
                                "# Plasmids in the Deposit",
                                "Addgene Contact",
                                "Country"]
        # NOTE: hits the filesystem (reads the Dropbox config file).
        self.DEFAULT_DIR = self.get_dropbox_dir()
def get_dropbox_dir(self):
    """
    Windows and Mac: locate the Dropbox dir for Business, falling back to
    Personal, falling back to the user's home directory.

    Returns a glob default (``<dropbox>/*.xls``) suitable for easygui's
    ``fileopenbox(default=...)`` argument, or a plain home path when no
    Dropbox configuration can be read.
    """
    if os.name == "nt":
        dropbox_file = os.path.join(os.getenv('APPDATA'), 'Dropbox', 'info.json')
    else:
        dropbox_file = os.path.expanduser("~/.dropbox/info.json")
    try:
        with open(dropbox_file) as dbf:
            dbconfig = json.load(dbf)
    except (IOError, OSError, ValueError):
        # No Dropbox install (or unreadable/corrupt config): previously this
        # crashed with an uncaught IOError.  Fall back to the home dir.
        return os.path.expanduser("~")
    if "business" in dbconfig:
        return dbconfig['business']['path'] + "/*.xls"
    if "personal" in dbconfig:
        return dbconfig['personal']['path'] + "/*.xls"
    return os.path.expanduser("~")
def _get_dataframe_by_extension(self, path, date_cols):
"""
Gets a dataframe either by .csv, or .xls(x),
or erroring and exiting.
"""
_, ext = os.path.splitext(path)
if ext == ".csv":
df = pd.read_csv(path, parse_dates=date_cols, encoding='utf-8')
elif ext in [".xlsx", ".xls"]:
df = pd.read_excel(path, parse_dates=date_cols, encoding='utf-8')
else:
easygui.msgbox("File was not of type {0}.\nQuitting".format(
" ".join(self.ACCEPTABLE_EXTENSIONS)),
"ERROR")
sys.exit(1)
return df
def get_dataframes(self):
    """
    Load the Salesforce and Deposit dataframes via :meth:`get_files`, then
    normalize them: trim the Salesforce footer, sort both by their date
    columns, build a "Full Name" column, and drop unneeded columns.

    Returns ``(salesforce_df, deposit_df)``.
    """
    salesforce_df, deposit_df = self.get_files()
    # Get rid of the footer that Salesforce adds.
    if self.footer_length:
        length_with_footer = len(salesforce_df.index)
        salesforce_df = salesforce_df.head(length_with_footer - self.footer_length)
    # Clean up Salesforce.  DataFrame.sort returns a sorted COPY
    # (inplace=False by default), so the result must be assigned back —
    # previously the return value was discarded and the frame stayed
    # unsorted.  reset_index keeps positions == labels for the positional
    # windowing done later with searchsorted/.ix.
    salesforce_df = salesforce_df.sort(SF_DATE, ascending=True).reset_index(drop=True)
    salesforce_df["Full Name"] = salesforce_df["First Name"].map(unicode) + " " + salesforce_df["Last Name"]
    # Cleanup Deposit Data.
    deposit_df['Org Name'].fillna('', inplace=True)
    deposit_df = deposit_df.sort(DP_DATE, ascending=True).reset_index(drop=True)
    # astype also returns a new Series; assign it back (was a no-op before).
    deposit_df['PI_Name'] = deposit_df['PI_Name'].astype(unicode)
    # Cleanup not needed columns; tolerate exports that lack some of them
    # (only the REQUIRED_* columns are guaranteed by get_files).
    for col in self.SF_TRIM_COLUMNS:
        if col in salesforce_df.columns:
            del salesforce_df[col]
    for col in self.DP_TRIM_COLUMNS:
        if col in deposit_df.columns:
            del deposit_df[col]
    return salesforce_df, deposit_df
def _prompt_and_load(self, title, label, date_cols, required_columns):
    """Prompt for one export file, load it, and verify its columns.

    Shows a message box and exits(1) when the user cancels or the file is
    missing required columns; otherwise returns the loaded DataFrame.
    """
    path = easygui.fileopenbox(title,
                               default=self.DEFAULT_DIR,
                               filetypes=self.ACCEPTABLE_EXTENSIONS)
    # easygui returns None on cancel in current releases and "." in older
    # ones — treat both as "no file selected".
    if not path or path == ".":
        easygui.msgbox("You did not select a {0} Export, stopping program.".format(label),
                       "Good Bye")
        sys.exit(1)
    df = self._get_dataframe_by_extension(path, date_cols=date_cols)
    # issubset (<=), not strict subset (<): a file containing EXACTLY the
    # required columns was previously rejected as "wrong columns".
    if not set(required_columns) <= set(df.columns):
        L.info("Wrong columns")
        easygui.msgbox("At a minimum, the {0} file must have the following columns:\n\n"
                       "{1}\n\n"
                       "Please re-run and select a proper file.".format(label, ", ".join(required_columns)),
                       "Incorrect columns")
        sys.exit(1)
    L.info("Proper columns")
    return df

def get_files(self):
    """Prompt the user for the Salesforce and Deposit Data exports and
    return both as validated DataFrames.
    """
    salesforce_df = self._prompt_and_load("Salesforce Export", "Salesforce",
                                          date_cols=[4, 5],
                                          required_columns=self.REQUIRED_SF_COLUMNS)
    deposit_df = self._prompt_and_load("Deposit Data", "Deposit Data",
                                       date_cols=[7, 8],
                                       required_columns=self.REQUIRED_DP_COLUMNS)
    return salesforce_df, deposit_df
def get_filtered(self, filtered_df, sf_row, pi_name, pi_org, kind):
    """
    Collect deposit rows from ``filtered_df`` matching the Salesforce row,
    either by PI name (kind="PI") or by organization (kind="ORG").

    Returns ``(single, double)`` lists of result dicts: ``double`` holds
    rows where BOTH institute and PI name match the Salesforce row;
    ``single`` holds rows where only the requested field matched.

    :raises ValueError: if ``kind`` is not "PI" or "ORG".  (Previously an
        unknown kind fell through to an UnboundLocalError.)
    """
    if kind == "PI":
        filter_column = self.PI_COLUMN
        filter_value = pi_name
    elif kind == "ORG":
        filter_column = self.ORG_COLUMN
        filter_value = pi_org
    else:
        raise ValueError("kind must be 'PI' or 'ORG', got {0!r}".format(kind))
    name_match = filtered_df[filtered_df[filter_column] == filter_value]
    single = []
    double = []
    if not name_match.empty:
        for _, row in name_match.iterrows():
            data = {
                "Addgene Assigned": sf_row['Assigned'],
                "Plasmid ID": row['Plasmid ID'],
                "Deposit ID": row['Deposit ID'],
                "Institute": row['Org Name'],
                "PI Name": row['PI_Name'],
                "Date Received": row[DP_DATE],
                "Original Date": sf_row[SF_DATE],
                "Original ORG": pi_org,
                "Original PI": pi_name,
            }
            # A row agreeing on both org and PI is a "double" match.
            if (data['Institute'] == data['Original ORG']) and \
                    (data['PI Name'] == data['Original PI']):
                double.append(data)
            else:
                single.append(data)
    return single, double
def get_attribution_dataframes(self):
    """
    Build the four report DataFrames: matches by PI name, matches by
    institute, "double" matches (both agree), and the combined singles.

    Returns a tuple of (sheet_label, DataFrame) pairs.
    """
    salesforce, dep = self.get_dataframes()
    name_matches = []
    org_matches = []
    double_matches = []
    mismatches = []
    # Iterate through the Salesforce report as the master document
    for index, sf_row in salesforce.iterrows():
        # Get a start date and an end date for filtering.
        start_date = sf_row[SF_DATE]
        end_date = start_date + relativedelta(months=self.months)
        # NOTE(review): searchsorted is only meaningful if dep is sorted by
        # DP_DATE (get_dataframes is responsible for that) — verify.
        # In this pandas era searchsorted returns an array, hence the [0].
        start = dep[DP_DATE].searchsorted(start_date)[0]
        end = dep[DP_DATE].searchsorted(end_date)[0]
        # Filter the deposit data to grab only things within that timeframe.
        # NOTE(review): .ix is used with POSITIONAL intent here; it behaves
        # positionally only while dep's integer index equals 0..n-1, and the
        # label-based slice includes the end row — confirm both are intended.
        filtered_df = dep.ix[start:end]
        # Variables for short names, and not having to type index a lot.
        # (Python 2: `unicode` built-in.)
        pi_name = unicode(sf_row['Full Name'])
        pi_org = sf_row['Account Name']
        # Get matches by the PI's name
        by_name, by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org, kind="PI")
        name_matches.extend(by_name)
        mismatches.extend(by_name)
        double_matches.extend(by_both)
        # Get matches by the organization name
        by_org, by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org, kind="ORG")
        org_matches.extend(by_org)
        # NOTE(review): singles from BOTH passes are accumulated into
        # mismatches (the "Single" sheet) — looks intentional, but confirm.
        mismatches.extend(by_org)
        double_matches.extend(by_both)
    return (
        ("PI", pd.DataFrame(name_matches, columns=self.OUTPUT_COLUMN_ORDER)),
        ("Institute", pd.DataFrame(org_matches, columns=self.OUTPUT_COLUMN_ORDER)),
        ("Double", pd.DataFrame(double_matches, columns=self.OUTPUT_COLUMN_ORDER)),
        ("Single", pd.DataFrame(mismatches, columns=self.OUTPUT_COLUMN_ORDER))
    )
def run(self):
frames = self.get_attribution_dataframes()
dirname = easygui.diropenbox("Where to save reports?", "Select Report Output Directory", self.DEFAULT_DIR)
if not dirname:
dirname = self.DEFAULT_DIR
for key, df in frames:
fname = '{0}_Attribution_Report_{1}_Match.xlsx'.format(datetime.date.today(), key)
xls_path = os.path.join(dirname, fname)
print "Writing", xls_path
writer = pd.ExcelWriter(xls_path,
engine='xlsxwriter')
df.to_excel(writer,
sheet_name='Sheet1',
index=False)
writer.save()
def main():
    """Entry point: build and run the report with default settings, showing
    any crash to the user in a GUI exception box.
    """
    try:
        report = AttributionReport(credentials_file="credentials.json",
                                   months=6,
                                   footer_length=6)
        report.run()
    except Exception:
        # Was a bare ``except:``, which also trapped SystemExit — so the
        # deliberate sys.exit(1) aborts inside the report code popped an
        # exception box instead of quitting.  ``Exception`` lets SystemExit
        # and KeyboardInterrupt propagate while still surfacing real crashes.
        easygui.exceptionbox()


if __name__ == '__main__':
    main()