diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a49bc4d --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*.pyc +.idea +build* diff --git a/attribution.py b/attribution.py new file mode 100644 index 0000000..328168a --- /dev/null +++ b/attribution.py @@ -0,0 +1,221 @@ +# -*- coding: utf-8 -*- +import os +import datetime +import logging + +import pandas as pd +from dateutil.relativedelta import relativedelta + +L = logging.getLogger(__name__) + + +class AttributionReport(object): + def __init__(self, months=6, footer_length=None): + self.months = months + self.footer_length = footer_length + + self.SF_DATE_COLUMN = "Date" + self.DP_DATE_COLUMN = "Date Received" + self.PI_COLUMN = "PI_Name" + self.ORG_COLUMN = "Org Name" + # Output the XLSX in this order + + self.OUTPUT_COLUMN_ORDER = ["Addgene Assigned", "Plasmid ID", "Deposit ID", "Institute", "PI Name", + "Date Received", "Original Date", "Original ORG", "Original PI"] + + self.ACCEPTABLE_EXTENSIONS = ["*.csv", "*.xls", "*.xlsx"] + + # columns that need to be in the files + self.REQUIRED_SF_COLUMNS = ["First Name", "Last Name", "Account Name", "Date", "Assigned"] + self.REQUIRED_DP_COLUMNS = ["Org Name", "Deposit ID", "Plasmid ID", "PI_Name", "Date Received"] + + # After load and merging, delete these columns + self.SF_TRIM_COLUMNS = ["Subject", "Created Date", "LIMS Organization ID", + "Account Description"] + self.DP_TRIM_COLUMNS = ["Org ID", "Deposit Status", "PI_ID", "Date Available", "# Orders", + "# Plasmids in the Deposit", "Addgene Contact", "Country"] + + self.salesforce_df = None + self.deposit_df = None + self.output_dir = None + self.frames = None + + def _get_dataframe_by_extension(self, path, date_cols): + """ + Gets a dataframe either by .csv, or .xls(x), + or erroring and exiting. 
+ """ + _, ext = os.path.splitext(path) + + if ext == ".csv": + df = pd.read_csv(path, parse_dates=date_cols, encoding='utf-8') + elif ext in [".xlsx", ".xls"]: + df = pd.read_excel(path, parse_dates=date_cols, encoding='utf-8') + else: + raise Exception("File was not of type {0}.\nQuitting".format( + " ".join(self.ACCEPTABLE_EXTENSIONS))) + return df + + def set_dataframe_sf(self, fname): + self.salesforce_df = None + try: + salesforce_df = self._get_dataframe_by_extension(fname, date_cols=[self.SF_DATE_COLUMN, ]) + except IndexError: + return False + except ValueError: + return False + except: + raise + + if set(self.REQUIRED_SF_COLUMNS) < set(salesforce_df.columns): + self.salesforce_df = salesforce_df + return True + L.info("Wrong columns") + return False + + def set_dataframe_deposit(self, fname): + self.deposit_df = None + try: + deposit_df = self._get_dataframe_by_extension(fname, date_cols=[self.DP_DATE_COLUMN, ]) + except IndexError: + return False + except ValueError: + return False + except: + raise + if set(self.REQUIRED_DP_COLUMNS) < set(deposit_df.columns): + self.deposit_df = deposit_df + return True + L.info("Wrong columns") + return False + + def set_output_dir(self, dir): + self.output_dir = dir + + def get_dataframes(self): + salesforce_df, deposit_df = self.clean_dataframes() + return salesforce_df, deposit_df + + def clean_dataframes(self): + # Get rid of the footer that Salesforce adds. 
+ if self.footer_length: + length_with_footer = len(self.salesforce_df.index) + self.salesforce_df = self.salesforce_df.head(length_with_footer - self.footer_length) + + # Clean up Salesforce + self.salesforce_df.sort(self.SF_DATE_COLUMN, ascending=1, inplace=True) + + # Cleanup Deposit Data + self.deposit_df['Org Name'].fillna('', inplace=True) + self.deposit_df.sort(self.DP_DATE_COLUMN, ascending=1, inplace=True) + self.deposit_df['PI_Name'] = self.deposit_df['PI_Name'].astype(unicode) + + # Cleanup not needed columns + for col in self.SF_TRIM_COLUMNS: + del self.salesforce_df[col] + for col in self.DP_TRIM_COLUMNS: + del self.deposit_df[col] + + def get_filtered(self, filtered_df, sf_row, pi_name, pi_org, org=False): + """ + Assume kind is PI by default. + Filter where either the PI and PI match, or the Org and Org match + If both match, add it to the double list + if only one matches, add it to the single list. + """ + filter_column = self.PI_COLUMN + filter_value = pi_name + single, double = [], [] + + if org: + filter_column = self.ORG_COLUMN + filter_value = pi_org + + name_match = filtered_df[filtered_df[filter_column] == filter_value] + + if not name_match.empty: + for _, row in name_match.iterrows(): + data = { + "Addgene Assigned": sf_row['Assigned'], + "Plasmid ID": row['Plasmid ID'], + "Deposit ID": row['Deposit ID'], + "Institute": row['Org Name'], + "PI Name": row['PI_Name'], + "Date Received": row[self.DP_DATE_COLUMN], + "Original Date": sf_row[self.SF_DATE_COLUMN], + "Original ORG": pi_org, + "Original PI": pi_name, + } + if (data['Institute'] == data['Original ORG']) and \ + (data['PI Name'] == data['Original PI']): + double.append(data) + else: + single.append(data) + return single, double + + def get_attribution_dataframes(self): + self.clean_dataframes() + + name_matches = [] + org_matches = [] + double_matches = [] + mismatches = [] + + # Iterate through the Salesforce report as the master document + for index, sf_row in self.salesforce_df.iterrows(): + # Get a start date and an end date for 
filtering. + start_date = sf_row[self.SF_DATE_COLUMN] + end_date = start_date + relativedelta(months=self.months) + + start = self.deposit_df[self.DP_DATE_COLUMN].searchsorted(start_date)[0] + end = self.deposit_df[self.DP_DATE_COLUMN].searchsorted(end_date)[0] + + # Filter the deposit data to grab only things within that timeframe. + filtered_df = self.deposit_df.ix[start:end] + + # Variables for short names, and not having to type index a lot. + pi_name = unicode(sf_row['First Name']) + " " + unicode(sf_row['Last Name']) + pi_org = sf_row['Account Name'] + + # Get matches by the PI's name + by_name, pi_by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org) + name_matches.extend(by_name) + mismatches.extend(by_name) + double_matches.extend(pi_by_both) + + # Get matches by the organization name + by_org, org_by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org, org=True) + org_matches.extend(by_org) + mismatches.extend(by_org) + double_matches.extend(org_by_both) + + return ( + ("PI", pd.DataFrame(name_matches, columns=self.OUTPUT_COLUMN_ORDER)), + ("Institute", pd.DataFrame(org_matches, columns=self.OUTPUT_COLUMN_ORDER)), + ("Double", pd.DataFrame(double_matches, columns=self.OUTPUT_COLUMN_ORDER)), + ("Single", pd.DataFrame(mismatches, columns=self.OUTPUT_COLUMN_ORDER)) + ) + + def run(self): + self.frames = None + self.frames = self.get_attribution_dataframes() + + def save(self): + for key, df in self.frames: + fname = '{0}_Attribution_Report_{1}_Match.xlsx'.format(datetime.date.today(), key) + + output_path = os.path.join(self.output_dir, fname) + + deduped_df = df.drop_duplicates() + + with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer: + deduped_df.to_excel(writer, sheet_name='Sheet1', index=False) + +if __name__ == '__main__': + app = AttributionReport(months=6, footer_length=6) + app.set_dataframe_deposit("/Users/tyrelsouza/Dropbox (Addgene)/Addgene Shared/Dev/Attribution Report/deposit_data.csv") + 
app.set_dataframe_sf("/Users/tyrelsouza/Dropbox (Addgene)/Addgene Shared/Dev/Attribution Report/salesforce_report.xlsx") + app.set_output_dir("/Users/tyrelsouza/Dropbox (Addgene)/Addgene Shared/Dev/Attribution Report/Output/") + app.run() + app.save() + diff --git a/main.py b/main.py deleted file mode 100644 index f569157..0000000 --- a/main.py +++ /dev/null @@ -1,170 +0,0 @@ -import os -import json - -import pandas as pd -import dropbox -from dateutil.relativedelta import relativedelta - -DROPBOX = False -local_dropbox_path = "/Users/tyrelsouza/Dropbox (Addgene)/" - -SF_DATE = "Date" -DP_DATE = "Date Received" - - -class AttributionReport(object): - def __init__(self, credentials_file, months=6, footer_length=None): - self.months = months - self.footer_length = footer_length - self.PI_COLUMN = "PI_Name" - self.ORG_COLUMN = "Org Name" - - with open(credentials_file, "r") as cred_f: - creds = json.loads(cred_f.read()) - - self.app_key = creds['app_key'] - self.app_secret = creds['app_secret'] - - if not creds.get("access_token", None): - self.authorize() - else: - self.access_token = creds['access_token'] - self.user_id = creds['user_id'] - - def authorize(self): - flow = dropbox.client.DropboxOAuth2FlowNoRedirect(self.app_key, self.app_secret) - authorize_url = flow.start() - print '1. Go to: ' + authorize_url - print '2. Click "Allow" (you might have to log in first)' - print '3. Copy the authorization code.' - code = raw_input("Enter the authorization code here: ").strip() - access_token, user_id = flow.finish(code) - self.access_token = access_token - self.user_id = user_id - - creds = {"app_key": self.app_key, - "app_secret": self.app_secret, - "access_token": self.access_token, - "user_id": self.user_id} - - # Save so we don't have to do this again. 
- with open("credentials.json", "w") as f: - f.write(json.dumps(creds)) - - def _open_file_frame(self, filename, date_cols): - if DROPBOX: - client = dropbox.client.DropboxClient(self.access_token) - f = client.get_file(filename) - else: - f = os.path.normpath(local_dropbox_path + filename) - - if filename[-4:] == ".csv": - df = pd.read_csv(f, parse_dates=date_cols, encoding='utf-8') - else: - df = pd.read_excel(f, parse_dates=date_cols, encoding='utf-8') - return df - - def get_dataframes(self): - """ - This gets the Salesforce and the Deposit dataframes. - Then it does some cleanup of the columns - """ - salesforce_data_name = '/Addgene Shared/Dev/Attribution Report/salesforce_report.xlsx' - salesforce_df = self._open_file_frame(salesforce_data_name, date_cols=[4, 5]) - - if self.footer_length: - length_with_footer = len(salesforce_df.index) - salesforce_df = salesforce_df.head(length_with_footer - self.footer_length) - - deposit_data_name = 'Addgene Shared/Dev/Attribution Report/deposit_data.csv' - deposit_df = self._open_file_frame(deposit_data_name, date_cols=[7, 8]) - - # Clean up Salesforce - salesforce_df['Account Description'].fillna('', inplace=True) - salesforce_df.sort(SF_DATE, ascending=1) - salesforce_df["Full Name"] = salesforce_df["First Name"].map(unicode) + " " + salesforce_df["Last Name"] - del salesforce_df["First Name"] - del salesforce_df["Last Name"] - - # Cleanup Deposit Data - deposit_df['Org Name'].fillna('', inplace=True) - deposit_df.sort(DP_DATE, ascending=1) - deposit_df['PI_Name'].astype(unicode) - - return salesforce_df, deposit_df - - def get_filtered(self, filtered_df, sf_row, pi_name, pi_org, kind): - if kind == "PI": - filter_column = self.PI_COLUMN - filter_value = pi_name - elif kind == "ORG": - filter_column = self.ORG_COLUMN - filter_value = pi_org - - name_match = filtered_df[filtered_df[filter_column] == filter_value] - - output = [] - if not name_match.empty: - for _, row in name_match.iterrows(): - data = { - "Addgene 
Assigned": sf_row['Assigned'], - "Plasmid ID": row['Plasmid ID'], - "Deposit ID": row['Deposit ID'], - "Institute": row['Org Name'], - "PI Name": row['PI_Name'], - "Date Received": row[DP_DATE], - "Original Date": sf_row[SF_DATE], - "Original ORG": pi_org, - "Original PI": pi_name, - } - output.append(data) - return output - - def get_attribution_dataframes(self): - salesforce, dep = self.get_dataframes() - name_matches = [] - org_matches = [] - - # Iterate through the Salesforce report as the master document - for index, sf_row in salesforce.iterrows(): - # Get a start date and an end date for filtering. - start_date = sf_row[SF_DATE] - end_date = start_date + relativedelta(months=self.months) - - start = dep[DP_DATE].searchsorted(start_date)[0] - end = dep[DP_DATE].searchsorted(end_date)[0] - - # Filter the deposit data to grab only things within that timeframe. - filtered_df = dep.ix[start:end] - - # Variables for short names, and not having to type index a lot. - pi_name = unicode(sf_row['Full Name']) - pi_org = sf_row['Account Name'] - - # Get matches by the PI's name - by_name = self.get_filtered(filtered_df, - sf_row, - pi_name, - pi_org, - kind="PI") - name_matches.extend(by_name) - - # Get matches by the organization name - by_org = self.get_filtered(filtered_df, - sf_row, - pi_name, - pi_org, - kind="ORG") - org_matches.extend(by_org) - return pd.DataFrame(name_matches), pd.DataFrame(org_matches) - - def run(self): - name_df, org_df = self.get_attribution_dataframes() - name_df.to_excelv("names.xls") - org_df.to_excel("orgs.xls") - -if __name__ == '__main__': - report = AttributionReport(credentials_file="credentials.json", - months=6, - footer_length=6) - report.run() diff --git a/requirements.txt b/requirements.txt index 5bd0648..dd5fedd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -dropbox -pandas -xlrd -python-dateutil \ No newline at end of file +XlsxWriter==0.7.3 +pandas==0.16.2 +python-dateutil==2.4.2 +xlrd==0.9.4 diff --git 
a/utils.py b/utils.py new file mode 100644 index 0000000..0891f6a --- /dev/null +++ b/utils.py @@ -0,0 +1,25 @@ +__author__ = 'tyrelsouza' +import os +import json + + +def get_dropbox_dir(): + """ + Windows and Mac get dropox dir for Business or fallback to personal + """ + if os.name == "nt": + dropbox_file = os.path.join(os.getenv('APPDATA'), 'Dropbox', 'info.json') + else: + dropbox_file = os.path.expanduser("~/.dropbox/info.json") + + with open(dropbox_file) as dbf: + dbconfig = json.loads(dbf.read()) + + if "business" in dbconfig: + dropbox_dir = dbconfig['business']['path'] + elif "personal" in dbconfig: + dropbox_dir = dbconfig['personal']['path'] + else: + dropbox_dir = os.path.expanduser("~") + + return dropbox_dir