import os
import json
import datetime
import logging

import pandas as pd
import dropbox
from dateutil.relativedelta import relativedelta

L = logging.getLogger(__name__)

# Set to True to fetch the report files through the Dropbox API instead of the
# locally synced Dropbox folder.
DROPBOX = False
local_dropbox_path = "/Users/tyrelsouza/Dropbox (Addgene)/"

# Date columns used for sorting and windowing.
SF_DATE = "Date"           # Salesforce report
DP_DATE = "Date Received"  # deposit data


class AttributionReport(object):
    """Match rows of a Salesforce attribution report against Addgene deposit
    data received within a configurable window after each Salesforce date."""

    def __init__(self, credentials_file, months=6, footer_length=None):
        self.months = months
        self.footer_length = footer_length
        self.PI_COLUMN = "PI_Name"
        self.ORG_COLUMN = "Org Name"
        self.column_order = ["Addgene Assigned", "Plasmid ID", "Deposit ID",
                             "Institute", "PI Name", "Date Received",
                             "Original Date", "Original ORG", "Original PI"]

        with open(credentials_file, "r") as cred_f:
            creds = json.load(cred_f)
        self.app_key = creds['app_key']
        self.app_secret = creds['app_secret']
        if not creds.get("access_token"):
            self.authorize()
        else:
            self.access_token = creds['access_token']
            self.user_id = creds['user_id']

    def authorize(self):
        """Run the Dropbox OAuth2 no-redirect flow and cache the credentials."""
        flow = dropbox.client.DropboxOAuth2FlowNoRedirect(self.app_key,
                                                          self.app_secret)
        authorize_url = flow.start()
        print '1. Go to: ' + authorize_url
        print '2. Click "Allow" (you might have to log in first).'
        print '3. Copy the authorization code.'
        code = raw_input("Enter the authorization code here: ").strip()
        self.access_token, self.user_id = flow.finish(code)

        creds = {"app_key": self.app_key,
                 "app_secret": self.app_secret,
                 "access_token": self.access_token,
                 "user_id": self.user_id}
        # Save so we don't have to do this again.
        with open("credentials.json", "w") as f:
            f.write(json.dumps(creds))

    def _open_file_frame(self, filename, date_cols):
        """Open a CSV or Excel file from Dropbox (or the local sync folder)
        and return it as a DataFrame with the given date columns parsed."""
        if DROPBOX:
            L.info("Getting file from Dropbox")
            client = dropbox.client.DropboxClient(self.access_token)
            f = client.get_file(filename)
            L.info("Got file from Dropbox")
        else:
            f = os.path.normpath(local_dropbox_path + filename)

        if filename.endswith(".csv"):
            df = pd.read_csv(f, parse_dates=date_cols, encoding='utf-8')
        else:
            df = pd.read_excel(f, parse_dates=date_cols, encoding='utf-8')
        return df

    def get_dataframes(self):
        """Get the Salesforce and deposit dataframes and clean up their columns."""
        salesforce_data_name = '/Addgene Shared/Dev/Attribution Report/salesforce_report.xlsx'
        salesforce_df = self._open_file_frame(salesforce_data_name, date_cols=[4, 5])
        if self.footer_length:
            # Drop the footer rows appended to the Salesforce report.
            length_with_footer = len(salesforce_df.index)
            salesforce_df = salesforce_df.head(length_with_footer - self.footer_length)

        deposit_data_name = '/Addgene Shared/Dev/Attribution Report/deposit_data.csv'
        deposit_df = self._open_file_frame(deposit_data_name, date_cols=[7, 8])

        # Clean up Salesforce data.
        salesforce_df['Account Description'].fillna('', inplace=True)
        salesforce_df = salesforce_df.sort_values(SF_DATE, ascending=True)
        salesforce_df["Full Name"] = (salesforce_df["First Name"].map(unicode) +
                                      " " + salesforce_df["Last Name"])
        del salesforce_df["First Name"]
        del salesforce_df["Last Name"]

        # Clean up deposit data; it must be sorted by date for searchsorted().
        deposit_df['Org Name'].fillna('', inplace=True)
        deposit_df = deposit_df.sort_values(DP_DATE, ascending=True)
        deposit_df['PI_Name'] = deposit_df['PI_Name'].astype(unicode)

        return salesforce_df, deposit_df

    def get_filtered(self, filtered_df, sf_row, pi_name, pi_org, kind):
        """Match deposits against one Salesforce row, by PI name or by
        organization. Returns (single, double): rows that match on one field
        and rows that match on both PI and organization."""
        if kind == "PI":
            filter_column = self.PI_COLUMN
            filter_value = pi_name
        elif kind == "ORG":
            filter_column = self.ORG_COLUMN
            filter_value = pi_org
        else:
            raise ValueError("kind must be 'PI' or 'ORG', got %r" % kind)

        name_match = filtered_df[filtered_df[filter_column] == filter_value]
        single = []
        double = []
        for _, row in name_match.iterrows():
            data = {
                "Addgene Assigned": sf_row['Assigned'],
                "Plasmid ID": row['Plasmid ID'],
                "Deposit ID": row['Deposit ID'],
                "Institute": row['Org Name'],
                "PI Name": row['PI_Name'],
                "Date Received": row[DP_DATE],
                "Original Date": sf_row[SF_DATE],
                "Original ORG": pi_org,
                "Original PI": pi_name,
            }
            if (data['Institute'] == data['Original ORG'] and
                    data['PI Name'] == data['Original PI']):
                double.append(data)
            else:
                single.append(data)
        return single, double

    def get_attribution_dataframes(self):
        salesforce, dep = self.get_dataframes()
        name_matches = []
        org_matches = []
        double_matches = []
        mismatches = []

        # Iterate through the Salesforce report as the master document.
        for _, sf_row in salesforce.iterrows():
            # Get a start date and an end date for filtering.
            start_date = sf_row[SF_DATE]
            end_date = start_date + relativedelta(months=self.months)
            start = dep[DP_DATE].searchsorted(start_date)[0]
            end = dep[DP_DATE].searchsorted(end_date)[0]
            # Keep only the deposits received within that timeframe.
            filtered_df = dep.iloc[start:end]

            # Short names so we don't have to index sf_row repeatedly.
            pi_name = unicode(sf_row['Full Name'])
            pi_org = sf_row['Account Name']

            # Get matches by the PI's name.
            by_name, by_both = self.get_filtered(filtered_df, sf_row,
                                                 pi_name, pi_org, kind="PI")
            name_matches.extend(by_name)
            mismatches.extend(by_name)
            double_matches.extend(by_both)

            # Get matches by the organization name.
            by_org, by_both = self.get_filtered(filtered_df, sf_row,
                                                pi_name, pi_org, kind="ORG")
            org_matches.extend(by_org)
            mismatches.extend(by_org)
            double_matches.extend(by_both)

        return {"PI": pd.DataFrame(name_matches, columns=self.column_order),
                "Institute": pd.DataFrame(org_matches, columns=self.column_order),
                "Double": pd.DataFrame(double_matches, columns=self.column_order),
                "Single": pd.DataFrame(mismatches, columns=self.column_order)}

    def run(self):
        """Write one Excel workbook per match category."""
        frames = self.get_attribution_dataframes()
        for key, df in frames.items():
            fname = '{0}_Attribution_Report_{1}_Match.xlsx'.format(
                datetime.date.today(), key)
            print "Writing", fname
            writer = pd.ExcelWriter(fname, engine='xlsxwriter')
            df.to_excel(writer, sheet_name='Sheet1', index=False)
            writer.save()


if __name__ == '__main__':
    report = AttributionReport(credentials_file="credentials.json",
                               months=6, footer_length=6)
    report.run()
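

# ------------------------------------------------------------------------------
# Illustrative sketch, not part of the report itself: judging from the keys read
# in __init__ and written back by authorize(), a minimal credentials.json only
# needs the Dropbox app key and secret; authorize() fills in the access token
# and user id after the OAuth flow completes. The values below are placeholders,
# not real credentials.
#
#   import json
#   with open("credentials.json", "w") as f:
#       json.dump({"app_key": "YOUR_APP_KEY", "app_secret": "YOUR_APP_SECRET"}, f)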