# -*- coding: utf-8 -*- import os import datetime import logging import pandas as pd from dateutil.relativedelta import relativedelta L = logging.getLogger(__name__) class AttributionReport(object): def __init__(self, months=6, footer_length=None): self.months = months self.footer_length = footer_length self.SF_DATE_COLUMN = "Date" self.DP_DATE_COLUMN = "Date Received" self.PI_COLUMN = "PI_Name" self.ORG_COLUMN = "Org Name" # Output the XLSX in this order self.OUTPUT_COLUMN_ORDER = ["Addgene Assigned", "Plasmid ID", "Deposit ID", "Institute", "PI Name", "Date Received", "Original Date", "Original ORG", "Original PI"] self.ACCEPTABLE_EXTENSIONS = ["*.csv", "*.xls", "*.xlsx"] # columns that need to be in the files self.REQUIRED_SF_COLUMNS = ["First Name", "Last Name", "Account Name", "Date", "Assigned"] self.REQUIRED_DP_COLUMNS = ["Org Name", "Deposit ID", "Plasmid ID", "PI_Name", "Date Received"] # After load and merging, delete these columns self.SF_TRIM_COLUMNS = ["Subject", "Created Date", "LIMS Organization ID", "Account Description"] self.DP_TRIM_COLUMNS = ["Org ID", "Deposit Status", "PI_ID", "Date Available", "# Orders", "# Plasmids in the Deposit", "Addgene Contact", "Country"] self.salesforce_df = None self.deposit_df = None self.output_dir = None self.frames = None def _get_dataframe_by_extension(self, path, date_cols): """ Gets a dataframe either by .csv, or .xls(x), or erroring and exiting. """ _, ext = os.path.splitext(path) if ext == ".csv": df = pd.read_csv(path, parse_dates=date_cols, encoding='utf-8') elif ext in [".xlsx", ".xls"]: df = pd.read_excel(path, parse_dates=date_cols, encoding='utf-8') else: raise Exception("File was not of type {0}.\nQuitting".format( " ".join(self.ACCEPTABLE_EXTENSIONS))) return df def set_dataframe_sf(self, fname): self.salesforce_df = None try: salesforce_df = self._get_dataframe_by_extension(fname, date_cols=[self.SF_DATE_COLUMN, ]) except IndexError: return False except ValueError: return False except: raise if set(self.REQUIRED_SF_COLUMNS) < set(salesforce_df.columns): self.salesforce_df = salesforce_df return True L.info("Wrong columns") return False def set_dataframe_deposit(self, fname): self.deposit_df = None try: deposit_df = self._get_dataframe_by_extension(fname, date_cols=[self.DP_DATE_COLUMN, ]) except IndexError: return False except ValueError: return False except: raise if set(self.REQUIRED_DP_COLUMNS) < set(deposit_df.columns): self.deposit_df = deposit_df return True L.info("Wrong columns") return False def set_output_dir(self, dir): self.output_dir = dir def get_dataframes(self): salesforce_df, deposit_df = self.clean_dataframes() return salesforce_df, deposit_df def clean_dataframes(self): # Get rid of the footer that Salesforce adds. if self.footer_length: length_with_footer = len(self.salesforce_df.index) self.salesforce_df = self.salesforce_df.head(length_with_footer - self.footer_length) # Clean up Salesforce self.salesforce_df.sort(self.SF_DATE_COLUMN, ascending=1) # Cleanup Deposit Data self.deposit_df['Org Name'].fillna('', inplace=True) self.deposit_df.sort(self.DP_DATE_COLUMN, ascending=1) self.deposit_df['PI_Name'].astype(unicode) # Cleanup not needed columns for col in self.SF_TRIM_COLUMNS: del self.salesforce_df[col] for col in self.DP_TRIM_COLUMNS: del self.deposit_df[col] def get_filtered(self, filtered_df, sf_row, pi_name, pi_org, org=False): """ Assume kind is PI by default. Filter where either the PI and PI match, or the Org and Org match If both match, add it to the the double list if only one matches, add it to the single list. """ filter_column = self.PI_COLUMN filter_value = pi_name single, double = [], [] if org: filter_column = self.ORG_COLUMN filter_value = pi_org name_match = filtered_df[filtered_df[filter_column] == filter_value] if not name_match.empty: for _, row in name_match.iterrows(): data = { "Addgene Assigned": sf_row['Assigned'], "Plasmid ID": row['Plasmid ID'], "Deposit ID": row['Deposit ID'], "Institute": row['Org Name'], "PI Name": row['PI_Name'], "Date Received": row[self.DP_DATE_COLUMN], "Original Date": sf_row[self.SF_DATE_COLUMN], "Original ORG": pi_org, "Original PI": pi_name, } if (data['Institute'] == data['Original ORG']) and \ (data['PI Name'] == data['Original PI']): double.append(data) else: single.append(data) return single, double def get_attribution_dataframes(self): self.clean_dataframes() name_matches = [] org_matches = [] double_matches = [] mismatches = [] # Iterate through the Salesforce report as the master document for index, sf_row in self.salesforce_df.iterrows(): # Get a start date and an end date for filtering. start_date = sf_row[self.SF_DATE_COLUMN] end_date = start_date + relativedelta(months=self.months) start = self.deposit_df[self.DP_DATE_COLUMN].searchsorted(start_date)[0] end = self.deposit_df[self.DP_DATE_COLUMN].searchsorted(end_date)[0] # Filter the deposit data to grab only things within that timeframe. filtered_df = self.deposit_df.ix[start:end] # Variables for short names, and not having to type index a lot. pi_name = unicode(sf_row['First Name']) + " " + unicode(sf_row['Last Name']) pi_org = sf_row['Account Name'] # Get matches by the PI's name by_name, pi_by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org) name_matches.extend(by_name) mismatches.extend(by_name) double_matches.extend(pi_by_both) # Get matches by the organization name by_org, org_by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org, org=True) org_matches.extend(by_org) mismatches.extend(by_org) double_matches.extend(org_by_both) return ( ("PI", pd.DataFrame(name_matches, columns=self.OUTPUT_COLUMN_ORDER)), ("Institute", pd.DataFrame(org_matches, columns=self.OUTPUT_COLUMN_ORDER)), ("Double", pd.DataFrame(double_matches, columns=self.OUTPUT_COLUMN_ORDER)), ("Single", pd.DataFrame(mismatches, columns=self.OUTPUT_COLUMN_ORDER)) ) def run(self): self.frames = None self.frames = self.get_attribution_dataframes() def save(self): for key, df in self.frames: fname = '{0}_Attribution_Report_{1}_Match.xlsx'.format(datetime.date.today(), key) output_path = os.path.join(self.output_dir, fname) deduped_df = df.drop_duplicates() with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer: deduped_df.to_excel(writer, sheet_name='Sheet1', index=False) if __name__ == '__main__': app = AttributionReport(months=6, footer_length=6) app.set_dataframe_deposit("/Users/tyrelsouza/Dropbox (Addgene)/Addgene Shared/Dev/Attribution Report/deposit_data.csv") app.set_dataframe_sf("/Users/tyrelsouza/Dropbox (Addgene)/Addgene Shared/Dev/Attribution Report/salesforce_report.xlsx") app.set_output_dir("/Users/tyrelsouza/Dropbox (Addgene)/Addgene Shared/Dev/Attribution Report/Output/") app.run() app.save()