import datetime
import json
import logging
import os
import sys

import pandas as pd
from dateutil.relativedelta import relativedelta
import easygui

L = logging.getLogger(__name__)


class AttributionReport(object):
    """Match Salesforce contact rows to Addgene deposit rows and write
    attribution spreadsheets.

    The Salesforce export is the master document; for each row we look for
    deposits within ``months`` of the contact date, matched either by PI name
    or by organization name, and write one XLSX per match category.
    """

    def __init__(self, credentials_file, months=6, footer_length=None):
        """
        :param credentials_file: path to a credentials JSON file.  Accepted
            for interface compatibility; not currently read by this class.
        :param months: size of the match window after each Salesforce date.
        :param footer_length: number of footer rows Salesforce appends to its
            export (stripped before processing), or None for no footer.
        """
        self.credentials_file = credentials_file
        self.months = months
        self.footer_length = footer_length

        self.SF_DATE_COLUMN = "Date"
        self.DP_DATE_COLUMN = "Date Received"
        self.PI_COLUMN = "PI_Name"
        self.ORG_COLUMN = "Org Name"

        # Output the XLSX in this order
        self.OUTPUT_COLUMN_ORDER = ["Addgene Assigned", "Plasmid ID",
                                    "Deposit ID", "Institute", "PI Name",
                                    "Date Received", "Original Date",
                                    "Original ORG", "Original PI"]

        self.ACCEPTABLE_EXTENSIONS = ["*.csv", "*.xls", "*.xlsx"]

        # Columns that need to be in the files
        self.REQUIRED_SF_COLUMNS = ["First Name", "Last Name", "Account Name",
                                    "Date", "Assigned"]
        self.REQUIRED_DP_COLUMNS = ["Org Name", "Deposit ID", "Plasmid ID",
                                    "PI_Name", "Date Received"]

        # After load and merging, delete these columns
        self.SF_TRIM_COLUMNS = ["Subject", "First Name", "Last Name",
                                "Created Date", "LIMS Organization ID",
                                "Account Description"]
        self.DP_TRIM_COLUMNS = ["Org ID", "Deposit Status", "PI_ID",
                                "Date Available", "# Orders",
                                "# Plasmids in the Deposit",
                                "Addgene Contact", "Country"]

        self.DEFAULT_DIR = self.get_dropbox_dir()

    def get_dropbox_dir(self):
        """Return the Dropbox directory (Business preferred, then personal),
        falling back to the user's home directory when Dropbox is absent.

        Windows keeps ``info.json`` under %APPDATA%; Mac/Linux under
        ``~/.dropbox``.
        """
        if os.name == "nt":
            dropbox_file = os.path.join(os.getenv('APPDATA'),
                                        'Dropbox', 'info.json')
        else:
            dropbox_file = os.path.expanduser("~/.dropbox/info.json")

        try:
            with open(dropbox_file) as dbf:
                dbconfig = json.load(dbf)
        except (OSError, IOError, ValueError):
            # No (or unreadable) Dropbox config: fall back to home, matching
            # the behavior of the final branch below.
            return os.path.expanduser("~")

        if "business" in dbconfig:
            return dbconfig['business']['path'] + "/*.xls"
        if "personal" in dbconfig:
            return dbconfig['personal']['path'] + "/*.xls"
        return os.path.expanduser("~")

    def _get_dataframe_by_extension(self, path, date_cols):
        """Load ``path`` as a DataFrame by its extension (.csv/.xls/.xlsx),
        or show an error box and exit.

        :param path: file selected by the user.
        :param date_cols: column indexes to parse as dates.
        """
        _, ext = os.path.splitext(path)
        ext = ext.lower()
        if ext == ".csv":
            df = pd.read_csv(path, parse_dates=date_cols, encoding='utf-8')
        elif ext in (".xlsx", ".xls"):
            # NOTE: modern pandas read_excel does not accept ``encoding``.
            df = pd.read_excel(path, parse_dates=date_cols)
        else:
            easygui.msgbox("File was not of type {0}.\nQuitting".format(
                " ".join(self.ACCEPTABLE_EXTENSIONS)), "ERROR")
            sys.exit(1)
        return df

    def get_dataframes(self):
        """Return the (salesforce, deposit) DataFrames, cleaned up.

        Both frames are sorted by their date column -- the deposit sort is
        required for the ``searchsorted`` windowing done later.
        """
        salesforce_df, deposit_df = self.get_files()

        # Get rid of the footer that Salesforce adds.
        if self.footer_length:
            length_with_footer = len(salesforce_df.index)
            salesforce_df = salesforce_df.head(
                length_with_footer - self.footer_length)

        # Clean up Salesforce.  BUGFIX: the old ``.sort(...)`` call discarded
        # its return value, so the frame was never actually sorted.
        salesforce_df = salesforce_df.sort_values(self.SF_DATE_COLUMN,
                                                  ascending=True)
        salesforce_df["Full Name"] = (salesforce_df["First Name"].map(str) +
                                      " " + salesforce_df["Last Name"])

        # Cleanup Deposit Data.  BUGFIX: sort result and astype result were
        # both discarded before; assign them back.
        deposit_df['Org Name'].fillna('', inplace=True)
        deposit_df = deposit_df.sort_values(self.DP_DATE_COLUMN,
                                            ascending=True)
        deposit_df = deposit_df.reset_index(drop=True)
        deposit_df['PI_Name'] = deposit_df['PI_Name'].astype(str)

        # Cleanup not needed columns
        for col in self.SF_TRIM_COLUMNS:
            del salesforce_df[col]
        for col in self.DP_TRIM_COLUMNS:
            del deposit_df[col]

        return salesforce_df, deposit_df

    def get_files(self):
        """Prompt for the Salesforce and Deposit exports, validate their
        columns, and return the two loaded DataFrames.

        Exits the program (after a message box) on cancel or bad columns.
        """
        salesforce_data_name = easygui.fileopenbox(
            "Salesforce Export",
            default=self.DEFAULT_DIR,
            filetypes=self.ACCEPTABLE_EXTENSIONS)
        if salesforce_data_name == ".":
            easygui.msgbox(
                "You did not select a Salesforce Export, stopping program.",
                "Good Bye")
            sys.exit(1)

        salesforce_df = self._get_dataframe_by_extension(salesforce_data_name,
                                                         date_cols=[4, 5])
        # BUGFIX: use subset-or-equal; a strict ``<`` wrongly rejected files
        # containing exactly the required columns and nothing else.
        if set(self.REQUIRED_SF_COLUMNS) <= set(salesforce_df.columns):
            L.info("Proper columns")
        else:
            L.info("Wrong columns")
            easygui.msgbox(
                "At a minimum, the Salesforce file must have the following "
                "columns:\n\n{0}\n\n"
                "Please re-run and select a proper file.".format(
                    ", ".join(self.REQUIRED_SF_COLUMNS)),
                "Incorrect columns")
            sys.exit(1)

        deposit_data_name = easygui.fileopenbox(
            "Deposit Data",
            default=self.DEFAULT_DIR,
            filetypes=self.ACCEPTABLE_EXTENSIONS)
        if deposit_data_name == ".":
            easygui.msgbox(
                "You did not select a Deposit Data Export, stopping program.",
                "Good Bye")
            sys.exit(1)

        deposit_df = self._get_dataframe_by_extension(deposit_data_name,
                                                      date_cols=[7, 8])
        if set(self.REQUIRED_DP_COLUMNS) <= set(deposit_df.columns):
            L.info("Proper columns")
        else:
            L.info("Wrong columns")
            easygui.msgbox(
                "At a minimum, the Deposit Data file must have the following "
                "columns:\n\n{0}\n\n"
                "Please re-run and select a proper file.".format(
                    ", ".join(self.REQUIRED_DP_COLUMNS)),
                "Incorrect columns")
            sys.exit(1)

        return salesforce_df, deposit_df

    def get_filtered(self, filtered_df, sf_row, pi_name, pi_org, kind):
        """Match deposit rows against one Salesforce row.

        :param filtered_df: deposit rows already windowed by date.
        :param sf_row: the Salesforce row being matched.
        :param pi_name: full PI name from the Salesforce row.
        :param pi_org: account (organization) name from the Salesforce row.
        :param kind: "PI" to match on PI name, "ORG" on organization name.
        :returns: (single, double) lists of result dicts; ``double`` holds
            rows where BOTH the institute and the PI name matched.
        :raises ValueError: on an unrecognized ``kind``.
        """
        if kind == "PI":
            filter_column = self.PI_COLUMN
            filter_value = pi_name
        elif kind == "ORG":
            filter_column = self.ORG_COLUMN
            filter_value = pi_org
        else:
            raise ValueError("kind must be 'PI' or 'ORG', got {0!r}".format(kind))

        name_match = filtered_df[filtered_df[filter_column] == filter_value]

        single = []
        double = []
        if not name_match.empty:
            for _, row in name_match.iterrows():
                data = {
                    "Addgene Assigned": sf_row['Assigned'],
                    "Plasmid ID": row['Plasmid ID'],
                    "Deposit ID": row['Deposit ID'],
                    "Institute": row['Org Name'],
                    "PI Name": row['PI_Name'],
                    "Date Received": row[self.DP_DATE_COLUMN],
                    "Original Date": sf_row[self.SF_DATE_COLUMN],
                    "Original ORG": pi_org,
                    "Original PI": pi_name,
                }
                if (data['Institute'] == data['Original ORG']) and \
                        (data['PI Name'] == data['Original PI']):
                    double.append(data)
                else:
                    single.append(data)
        return single, double

    def get_attribution_dataframes(self):
        """Build the four output DataFrames: PI matches, Institute matches,
        Double (both matched), and Single (either matched, not both).
        """
        salesforce, dep = self.get_dataframes()

        name_matches = []
        org_matches = []
        double_matches = []
        mismatches = []

        # Iterate through the Salesforce report as the master document.
        for index, sf_row in salesforce.iterrows():
            # Get a start date and an end date for filtering.
            start_date = sf_row[self.SF_DATE_COLUMN]
            end_date = start_date + relativedelta(months=self.months)

            # The deposit frame is sorted by date (see get_dataframes), so
            # searchsorted yields positional bounds.  BUGFIX: a scalar probe
            # returns a scalar on modern pandas -- no ``[0]`` indexing; and
            # ``.ix`` is gone, so slice positionally with ``.iloc``.
            start = int(dep[self.DP_DATE_COLUMN].searchsorted(start_date))
            end = int(dep[self.DP_DATE_COLUMN].searchsorted(end_date))

            # Filter the deposit data to grab only things within that timeframe.
            filtered_df = dep.iloc[start:end]

            # Variables for short names, and not having to type index a lot.
            pi_name = str(sf_row['Full Name'])
            pi_org = sf_row['Account Name']

            # Get matches by the PI's name
            by_name, by_both = self.get_filtered(filtered_df, sf_row, pi_name,
                                                 pi_org, kind="PI")
            name_matches.extend(by_name)
            mismatches.extend(by_name)
            double_matches.extend(by_both)

            # Get matches by the organization name
            by_org, by_both = self.get_filtered(filtered_df, sf_row, pi_name,
                                                pi_org, kind="ORG")
            org_matches.extend(by_org)
            mismatches.extend(by_org)
            double_matches.extend(by_both)

        return (
            ("PI", pd.DataFrame(name_matches,
                                columns=self.OUTPUT_COLUMN_ORDER)),
            ("Institute", pd.DataFrame(org_matches,
                                       columns=self.OUTPUT_COLUMN_ORDER)),
            ("Double", pd.DataFrame(double_matches,
                                    columns=self.OUTPUT_COLUMN_ORDER)),
            ("Single", pd.DataFrame(mismatches,
                                    columns=self.OUTPUT_COLUMN_ORDER)),
        )

    def run(self):
        """Build the report frames and write one XLSX per match category to
        a user-chosen directory (defaults to the Dropbox directory)."""
        frames = self.get_attribution_dataframes()

        dirname = easygui.diropenbox("Where to save reports?",
                                     "Select Report Output Directory",
                                     self.DEFAULT_DIR)
        if not dirname:
            dirname = self.DEFAULT_DIR

        for key, df in frames:
            fname = '{0}_Attribution_Report_{1}_Match.xlsx'.format(
                datetime.date.today(), key)
            xls_path = os.path.join(dirname, fname)
            print("Writing", xls_path)
            # BUGFIX: ``ExcelWriter.save`` was removed from pandas; the
            # context manager closes (and finalizes) the file even on error.
            with pd.ExcelWriter(xls_path, engine='xlsxwriter') as writer:
                df.to_excel(writer, sheet_name='Sheet1', index=False)


def main():
    try:
        report = AttributionReport(credentials_file="credentials.json",
                                   months=6, footer_length=6)
        report.run()
    # BUGFIX: a bare ``except:`` also swallowed SystemExit, turning the
    # intentional sys.exit(1) calls into an exception dialog.
    except Exception:
        easygui.exceptionbox()


if __name__ == '__main__':
    main()