Initial commit

2015-07-16 14:13:45 -04:00 · 2015-07-16 14:13:45 -04:00 · 53ee1caeee
commit 53ee1caeee
2 changed files with 174 additions and 0 deletions
--- a/main.py
+++ b/main.py
@ -0,0 +1,170 @@
+import os
+import json
+
+import pandas as pd
+import dropbox
+from dateutil.relativedelta import relativedelta
+
+# When True, input files are fetched through the Dropbox API; when False they
+# are read from the local folder named by local_dropbox_path.
+DROPBOX = False
+# Local folder that input file names are appended to when DROPBOX is False.
+local_dropbox_path = "/Users/tyrelsouza/Dropbox (Addgene)/"
+
+# Name of the date column read from the Salesforce report.
+SF_DATE = "Date"
+# Name of the date column read from the deposit data.
+DP_DATE = "Date Received"
+
+
+class AttributionReport(object):
+    def __init__(self, credentials_file, months=6, footer_length=None):
+        self.months = months
+        self.footer_length = footer_length
+        self.PI_COLUMN = "PI_Name"
+        self.ORG_COLUMN = "Org Name"
+
+        with open(credentials_file, "r") as cred_f:
+            creds = json.loads(cred_f.read())
+
+        self.app_key = creds['app_key']
+        self.app_secret = creds['app_secret']
+
+        if not creds.get("access_token", None):
+            self.authorize()
+        else:
+            self.access_token = creds['access_token']
+            self.user_id = creds['user_id']
+
+    def authorize(self):
+        flow = dropbox.client.DropboxOAuth2FlowNoRedirect(self.app_key, self.app_secret)
+        authorize_url = flow.start()
+        print '1. Go to: ' + authorize_url
+        print '2. Click "Allow" (you might have to log in first)'
+        print '3. Copy the authorization code.'
+        code = raw_input("Enter the authorization code here: ").strip()
+        access_token, user_id = flow.finish(code)
+        self.access_token = access_token
+        self.user_id = user_id
+
+        creds = {"app_key": self.app_key,
+                 "app_secret": self.app_secret,
+                 "access_token": self.access_token,
+                 "user_id": self.user_id}
+
+        # Save so we don't have to do this again.
+        with open("credentials.json", "w") as f:
+            f.write(json.dumps(creds))
+
+    def _open_file_frame(self, filename, date_cols):
+        if DROPBOX:
+            client = dropbox.client.DropboxClient(self.access_token)
+            f = client.get_file(filename)
+        else:
+            f = os.path.normpath(local_dropbox_path + filename)
+
+        if filename[-4:] == ".csv":
+            df = pd.read_csv(f, parse_dates=date_cols, encoding='utf-8')
+        else:
+            df = pd.read_excel(f, parse_dates=date_cols, encoding='utf-8')
+        return df
+
+    def get_dataframes(self):
+        """
+            This gets the Salesforce and the Deposit dataframes.
+            Then it does some cleanup of the columns
+        """
+        salesforce_data_name = '/Addgene Shared/Dev/Attribution Report/salesforce_report.xlsx'
+        salesforce_df = self._open_file_frame(salesforce_data_name, date_cols=[4, 5])
+
+        if self.footer_length:
+            length_with_footer = len(salesforce_df.index)
+            salesforce_df = salesforce_df.head(length_with_footer - self.footer_length)
+
+        deposit_data_name = 'Addgene Shared/Dev/Attribution Report/deposit_data.csv'
+        deposit_df = self._open_file_frame(deposit_data_name, date_cols=[7, 8])
+
+        # Clean up Salesforce
+        salesforce_df['Account Description'].fillna('', inplace=True)
+        salesforce_df.sort(SF_DATE, ascending=1)
+        salesforce_df["Full Name"] = salesforce_df["First Name"].map(unicode) + " " + salesforce_df["Last Name"]
+        del salesforce_df["First Name"]
+        del salesforce_df["Last Name"]
+
+        # Cleanup Deposit Data
+        deposit_df['Org Name'].fillna('', inplace=True)
+        deposit_df.sort(DP_DATE, ascending=1)
+        deposit_df['PI_Name'].astype(unicode)
+
+        return salesforce_df, deposit_df
+
+    def get_filtered(self, filtered_df, sf_row, pi_name, pi_org, kind):
+        if kind == "PI":
+            filter_column = self.PI_COLUMN
+            filter_value = pi_name
+        elif kind == "ORG":
+            filter_column = self.ORG_COLUMN
+            filter_value = pi_org
+
+        name_match = filtered_df[filtered_df[filter_column] == filter_value]
+
+        output = []
+        if not name_match.empty:
+            for _, row in name_match.iterrows():
+                data = {
+                    "Addgene Assigned": sf_row['Assigned'],
+                    "Plasmid ID": row['Plasmid ID'],
+                    "Deposit ID": row['Deposit ID'],
+                    "Institute": row['Org Name'],
+                    "PI Name": row['PI_Name'],
+                    "Date Received": row[DP_DATE],
+                    "Original Date": sf_row[SF_DATE],
+                    "Original ORG": pi_org,
+                    "Original PI": pi_name,
+                }
+                output.append(data)
+        return output
+
+    def get_attribution_dataframes(self):
+        salesforce, dep = self.get_dataframes()
+        name_matches = []
+        org_matches = []
+
+        # Iterate through the Salesforce report as the master document
+        for index, sf_row in salesforce.iterrows():
+            # Get a start date and an end date for filtering.
+            start_date = sf_row[SF_DATE]
+            end_date = start_date + relativedelta(months=self.months)
+
+            start = dep[DP_DATE].searchsorted(start_date)[0]
+            end = dep[DP_DATE].searchsorted(end_date)[0]
+
+            # Filter the deposit data to grab only things within that timeframe.
+            filtered_df = dep.ix[start:end]
+
+            # Variables for short names, and not having to type index a lot.
+            pi_name = unicode(sf_row['Full Name'])
+            pi_org = sf_row['Account Name']
+
+            # Get matches by the PI's name
+            by_name = self.get_filtered(filtered_df,
+                                        sf_row,
+                                        pi_name,
+                                        pi_org,
+                                        kind="PI")
+            name_matches.extend(by_name)
+
+            # Get matches by the organization name
+            by_org = self.get_filtered(filtered_df,
+                                       sf_row,
+                                       pi_name,
+                                       pi_org,
+                                       kind="ORG")
+            org_matches.extend(by_org)
+        return pd.DataFrame(name_matches), pd.DataFrame(org_matches)
+
+    def run(self):
+        name_df, org_df = self.get_attribution_dataframes()
+        name_df.to_excelv("names.xls")
+        org_df.to_excel("orgs.xls")
+
+if __name__ == '__main__':
+    report = AttributionReport(credentials_file="credentials.json",
+                               months=6,
+                               footer_length=6)
+    report.run()
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,4 @@
+dropbox
+pandas
+xlrd
+python-dateutil