commit 53ee1caeeea458c5d821d8fbe614bb59eba1fce7
Author: Tyrel Souza
Date:   Thu Jul 16 14:13:45 2015 -0400

    Initial commit

diff --git a/main.py b/main.py
new file mode 100644
index 0000000..f569157
--- /dev/null
+++ b/main.py
@@ -0,0 +1,170 @@
+import os
+import json
+
+import pandas as pd
+import dropbox
+from dateutil.relativedelta import relativedelta
+
+DROPBOX = False
+local_dropbox_path = "/Users/tyrelsouza/Dropbox (Addgene)/"
+
+SF_DATE = "Date"
+DP_DATE = "Date Received"
+
+
+class AttributionReport(object):
+    def __init__(self, credentials_file, months=6, footer_length=None):
+        self.months = months
+        self.footer_length = footer_length
+        self.PI_COLUMN = "PI_Name"
+        self.ORG_COLUMN = "Org Name"
+
+        with open(credentials_file, "r") as cred_f:
+            creds = json.loads(cred_f.read())
+
+        self.app_key = creds['app_key']
+        self.app_secret = creds['app_secret']
+
+        if not creds.get("access_token", None):
+            self.authorize()
+        else:
+            self.access_token = creds['access_token']
+            self.user_id = creds['user_id']
+
+    def authorize(self):
+        flow = dropbox.client.DropboxOAuth2FlowNoRedirect(self.app_key, self.app_secret)
+        authorize_url = flow.start()
+        print '1. Go to: ' + authorize_url
+        print '2. Click "Allow" (you might have to log in first)'
+        print '3. Copy the authorization code.'
+        code = raw_input("Enter the authorization code here: ").strip()
+        access_token, user_id = flow.finish(code)
+        self.access_token = access_token
+        self.user_id = user_id
+
+        creds = {"app_key": self.app_key,
+                 "app_secret": self.app_secret,
+                 "access_token": self.access_token,
+                 "user_id": self.user_id}
+
+        # Save so we don't have to do this again.
+        with open("credentials.json", "w") as f:
+            f.write(json.dumps(creds))
+
+    def _open_file_frame(self, filename, date_cols):
+        if DROPBOX:
+            client = dropbox.client.DropboxClient(self.access_token)
+            f = client.get_file(filename)
+        else:
+            f = os.path.normpath(local_dropbox_path + filename)
+
+        if filename[-4:] == ".csv":
+            df = pd.read_csv(f, parse_dates=date_cols, encoding='utf-8')
+        else:
+            df = pd.read_excel(f, parse_dates=date_cols, encoding='utf-8')
+        return df
+
+    def get_dataframes(self):
+        """
+        This gets the Salesforce and the Deposit dataframes.
+        Then it does some cleanup of the columns
+        """
+        salesforce_data_name = '/Addgene Shared/Dev/Attribution Report/salesforce_report.xlsx'
+        salesforce_df = self._open_file_frame(salesforce_data_name, date_cols=[4, 5])
+
+        if self.footer_length:
+            length_with_footer = len(salesforce_df.index)
+            salesforce_df = salesforce_df.head(length_with_footer - self.footer_length)
+
+        deposit_data_name = 'Addgene Shared/Dev/Attribution Report/deposit_data.csv'
+        deposit_df = self._open_file_frame(deposit_data_name, date_cols=[7, 8])
+
+        # Clean up Salesforce
+        salesforce_df['Account Description'].fillna('', inplace=True)
+        salesforce_df = salesforce_df.sort(SF_DATE, ascending=1)
+        salesforce_df["Full Name"] = salesforce_df["First Name"].map(unicode) + " " + salesforce_df["Last Name"]
+        del salesforce_df["First Name"]
+        del salesforce_df["Last Name"]
+
+        # Cleanup Deposit Data
+        deposit_df['Org Name'].fillna('', inplace=True)
+        deposit_df = deposit_df.sort(DP_DATE, ascending=1)
+        deposit_df['PI_Name'] = deposit_df['PI_Name'].astype(unicode)
+
+        return salesforce_df, deposit_df
+
+    def get_filtered(self, filtered_df, sf_row, pi_name, pi_org, kind):
+        if kind == "PI":
+            filter_column = self.PI_COLUMN
+            filter_value = pi_name
+        elif kind == "ORG":
+            filter_column = self.ORG_COLUMN
+            filter_value = pi_org
+
+        name_match = filtered_df[filtered_df[filter_column] == filter_value]
+
+        output = []
+        if not name_match.empty:
+            for _, row in name_match.iterrows():
+                data = {
+                    "Addgene Assigned": sf_row['Assigned'],
+                    "Plasmid ID": row['Plasmid ID'],
+                    "Deposit ID": row['Deposit ID'],
+                    "Institute": row['Org Name'],
+                    "PI Name": row['PI_Name'],
+                    "Date Received": row[DP_DATE],
+                    "Original Date": sf_row[SF_DATE],
+                    "Original ORG": pi_org,
+                    "Original PI": pi_name,
+                }
+                output.append(data)
+        return output
+
+    def get_attribution_dataframes(self):
+        salesforce, dep = self.get_dataframes()
+        name_matches = []
+        org_matches = []
+
+        # Iterate through the Salesforce report as the master document
+        for index, sf_row in salesforce.iterrows():
+            # Get a start date and an end date for filtering.
+            start_date = sf_row[SF_DATE]
+            end_date = start_date + relativedelta(months=self.months)
+
+            start = dep[DP_DATE].searchsorted(start_date)[0]
+            end = dep[DP_DATE].searchsorted(end_date)[0]
+
+            # Filter the deposit data to grab only things within that timeframe.
+            filtered_df = dep.ix[start:end]
+
+            # Variables for short names, and not having to type index a lot.
+            pi_name = unicode(sf_row['Full Name'])
+            pi_org = sf_row['Account Name']
+
+            # Get matches by the PI's name
+            by_name = self.get_filtered(filtered_df,
+                                        sf_row,
+                                        pi_name,
+                                        pi_org,
+                                        kind="PI")
+            name_matches.extend(by_name)
+
+            # Get matches by the organization name
+            by_org = self.get_filtered(filtered_df,
+                                       sf_row,
+                                       pi_name,
+                                       pi_org,
+                                       kind="ORG")
+            org_matches.extend(by_org)
+        return pd.DataFrame(name_matches), pd.DataFrame(org_matches)
+
+    def run(self):
+        name_df, org_df = self.get_attribution_dataframes()
+        name_df.to_excel("names.xls")
+        org_df.to_excel("orgs.xls")
+
+if __name__ == '__main__':
+    report = AttributionReport(credentials_file="credentials.json",
+                               months=6,
+                               footer_length=6)
+    report.run()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5bd0648
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+dropbox
+pandas
+xlrd
+python-dateutil
\ No newline at end of file
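Note: main.py expects a credentials.json next to it, read in __init__. Judging from the keys the script looks for, a minimal starting file would be something like the sketch below (placeholder values, not real credentials); authorize() then runs the Dropbox OAuth flow and rewrites the file with access_token and user_id added.

    {
        "app_key": "YOUR_DROPBOX_APP_KEY",
        "app_secret": "YOUR_DROPBOX_APP_SECRET"
    }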