Compare commits
4 Commits
90673f2724
...
e3d6952517
Author | SHA1 | Date | |
---|---|---|---|
|
e3d6952517 | ||
|
408e2a8de5 | ||
|
66b5fd6b31 | ||
|
0ca5217ddc |
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
*.pyc
|
||||
.idea
|
||||
build*
|
221
attribution.py
Normal file
221
attribution.py
Normal file
|
@ -0,0 +1,221 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
import datetime
|
||||
import logging
|
||||
|
||||
import pandas as pd
|
||||
from dateutil.relativedelta import relativedelta
|
||||
|
||||
L = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AttributionReport(object):
    """Matches Salesforce attribution rows to Addgene deposit records.

    Usage: load the two reports with set_dataframe_sf() and
    set_dataframe_deposit(), point set_output_dir() at a folder, then
    run() to build the match DataFrames and save() to write one XLSX
    file per match category.
    """

    def __init__(self, months=6, footer_length=None):
        """
        months -- width of the window (after each Salesforce date) in
                  which a deposit counts as attributable.
        footer_length -- number of footer rows Salesforce appends to its
                         exports; stripped in clean_dataframes().
        """
        self.months = months
        self.footer_length = footer_length

        # Column names used for filtering/matching.
        self.SF_DATE_COLUMN = "Date"
        self.DP_DATE_COLUMN = "Date Received"
        self.PI_COLUMN = "PI_Name"
        self.ORG_COLUMN = "Org Name"

        # Output the XLSX in this order
        self.OUTPUT_COLUMN_ORDER = ["Addgene Assigned", "Plasmid ID", "Deposit ID", "Institute", "PI Name",
                                    "Date Received", "Original Date", "Original ORG", "Original PI"]

        self.ACCEPTABLE_EXTENSIONS = ["*.csv", "*.xls", "*.xlsx"]

        # columns that need to be in the files
        self.REQUIRED_SF_COLUMNS = ["First Name", "Last Name", "Account Name", "Date", "Assigned"]
        self.REQUIRED_DP_COLUMNS = ["Org Name", "Deposit ID", "Plasmid ID", "PI_Name", "Date Received"]

        # After load and merging, delete these columns
        self.SF_TRIM_COLUMNS = ["Subject", "Created Date", "LIMS Organization ID",
                                "Account Description"]
        self.DP_TRIM_COLUMNS = ["Org ID", "Deposit Status", "PI_ID", "Date Available", "# Orders",
                                "# Plasmids in the Deposit", "Addgene Contact", "Country"]

        self.salesforce_df = None
        self.deposit_df = None
        self.output_dir = None
        self.frames = None

    def _get_dataframe_by_extension(self, path, date_cols):
        """
        Gets a dataframe either by .csv, or .xls(x),
        or erroring and exiting.
        """
        _, ext = os.path.splitext(path)

        if ext == ".csv":
            df = pd.read_csv(path, parse_dates=date_cols, encoding='utf-8')
        elif ext in [".xlsx", ".xls"]:
            df = pd.read_excel(path, parse_dates=date_cols, encoding='utf-8')
        else:
            raise Exception("File was not of type {0}.\nQuitting".format(
                " ".join(self.ACCEPTABLE_EXTENSIONS)))
        return df

    def set_dataframe_sf(self, fname):
        """Load the Salesforce report; return True on success."""
        self.salesforce_df = None
        try:
            salesforce_df = self._get_dataframe_by_extension(fname, date_cols=[self.SF_DATE_COLUMN, ])
        except (IndexError, ValueError):
            # Unreadable file / unparseable date column; anything else
            # propagates (same behavior as the former bare `except: raise`).
            return False

        # BUG FIX: was a strict-subset test (<), which wrongly rejected
        # files whose columns were exactly the required set.
        if set(self.REQUIRED_SF_COLUMNS) <= set(salesforce_df.columns):
            self.salesforce_df = salesforce_df
            return True
        L.info("Wrong columns")
        return False

    def set_dataframe_deposit(self, fname):
        """Load the deposit report; return True on success."""
        self.deposit_df = None
        try:
            deposit_df = self._get_dataframe_by_extension(fname, date_cols=[self.DP_DATE_COLUMN, ])
        except (IndexError, ValueError):
            # Unreadable file / unparseable date column.
            return False

        # BUG FIX: was a strict-subset test (<); see set_dataframe_sf.
        if set(self.REQUIRED_DP_COLUMNS) <= set(deposit_df.columns):
            self.deposit_df = deposit_df
            return True
        L.info("Wrong columns")
        return False

    def set_output_dir(self, dir):
        # NOTE: parameter name kept for caller compatibility even though
        # it shadows the builtin dir().
        self.output_dir = dir

    def get_dataframes(self):
        """Clean both loaded frames and return (salesforce_df, deposit_df)."""
        # BUG FIX: clean_dataframes() used to return None, so unpacking
        # its result here raised TypeError.
        salesforce_df, deposit_df = self.clean_dataframes()
        return salesforce_df, deposit_df

    def clean_dataframes(self):
        """Trim, sort and normalize both frames in place.

        Returns (salesforce_df, deposit_df) so callers can unpack them.
        """
        # Get rid of the footer that Salesforce adds.
        if self.footer_length:
            length_with_footer = len(self.salesforce_df.index)
            self.salesforce_df = self.salesforce_df.head(length_with_footer - self.footer_length)

        # Clean up Salesforce.
        # BUG FIX: DataFrame.sort() returns a sorted copy by default, so
        # the bare call was a no-op; sort in place and renumber the index
        # so the positional windows computed by searchsorted() line up.
        self.salesforce_df.sort(self.SF_DATE_COLUMN, ascending=1, inplace=True)
        self.salesforce_df.reset_index(drop=True, inplace=True)

        # Cleanup Deposit Data
        self.deposit_df['Org Name'].fillna('', inplace=True)
        self.deposit_df.sort(self.DP_DATE_COLUMN, ascending=1, inplace=True)
        self.deposit_df.reset_index(drop=True, inplace=True)
        # BUG FIX: astype() returns a new Series; the result was discarded.
        self.deposit_df['PI_Name'] = self.deposit_df['PI_Name'].astype(unicode)

        # Cleanup not needed columns
        for col in self.SF_TRIM_COLUMNS:
            del self.salesforce_df[col]
        for col in self.DP_TRIM_COLUMNS:
            del self.deposit_df[col]

        return self.salesforce_df, self.deposit_df

    def get_filtered(self, filtered_df, sf_row, pi_name, pi_org, org=False):
        """
        Assume kind is PI by default.
        Filter where either the PI and PI match, or the Org and Org match
        If both match, add it to the double list;
        if only one matches, add it to the single list.

        Returns (single, double) lists of match dicts.
        """
        filter_column = self.PI_COLUMN
        filter_value = pi_name
        single, double = [], []

        if org:
            filter_column = self.ORG_COLUMN
            filter_value = pi_org

        name_match = filtered_df[filtered_df[filter_column] == filter_value]

        if not name_match.empty:
            for _, row in name_match.iterrows():
                data = {
                    "Addgene Assigned": sf_row['Assigned'],
                    "Plasmid ID": row['Plasmid ID'],
                    "Deposit ID": row['Deposit ID'],
                    "Institute": row['Org Name'],
                    "PI Name": row['PI_Name'],
                    "Date Received": row[self.DP_DATE_COLUMN],
                    "Original Date": sf_row[self.SF_DATE_COLUMN],
                    "Original ORG": pi_org,
                    "Original PI": pi_name,
                }
                if (data['Institute'] == data['Original ORG']) and \
                        (data['PI Name'] == data['Original PI']):
                    double.append(data)
                else:
                    single.append(data)
        return single, double

    def get_attribution_dataframes(self):
        """Build the four match DataFrames.

        Returns a tuple of (sheet_name, DataFrame) pairs: matches by PI
        name, by institute, by both ("Double"), and by one field only
        ("Single").
        """
        self.clean_dataframes()

        name_matches = []
        org_matches = []
        double_matches = []
        mismatches = []

        # Iterate through the Salesforce report as the master document
        for index, sf_row in self.salesforce_df.iterrows():
            # Get a start date and an end date for filtering.
            start_date = sf_row[self.SF_DATE_COLUMN]
            end_date = start_date + relativedelta(months=self.months)

            start = self.deposit_df[self.DP_DATE_COLUMN].searchsorted(start_date)[0]
            end = self.deposit_df[self.DP_DATE_COLUMN].searchsorted(end_date)[0]

            # Filter the deposit data to grab only things within that timeframe.
            # BUG FIX: .ix is label-based and end-inclusive; searchsorted
            # yields positional indices, so use the half-open .iloc window.
            filtered_df = self.deposit_df.iloc[start:end]

            # Variables for short names, and not having to type index a lot.
            pi_name = unicode(sf_row['First Name']) + " " + unicode(sf_row['Last Name'])
            pi_org = sf_row['Account Name']

            # Get matches by the PI's name
            by_name, pi_by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org)
            name_matches.extend(by_name)
            mismatches.extend(by_name)
            double_matches.extend(pi_by_both)

            # Get matches by the organization name
            by_org, org_by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org, org=True)
            org_matches.extend(by_org)
            mismatches.extend(by_org)
            double_matches.extend(org_by_both)

        return (
            ("PI", pd.DataFrame(name_matches, columns=self.OUTPUT_COLUMN_ORDER)),
            ("Institute", pd.DataFrame(org_matches, columns=self.OUTPUT_COLUMN_ORDER)),
            ("Double", pd.DataFrame(double_matches, columns=self.OUTPUT_COLUMN_ORDER)),
            ("Single", pd.DataFrame(mismatches, columns=self.OUTPUT_COLUMN_ORDER))
        )

    def run(self):
        """Generate the match frames and keep them on self.frames."""
        # Clear stale results first so a failure leaves frames empty.
        self.frames = None
        self.frames = self.get_attribution_dataframes()

    def save(self):
        """Write each generated frame (de-duplicated) to its own XLSX."""
        for key, df in self.frames:
            fname = '{0}_Attribution_Report_{1}_Match.xlsx'.format(datetime.date.today(), key)

            output_path = os.path.join(self.output_dir, fname)

            deduped_df = df.drop_duplicates()

            with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
                deduped_df.to_excel(writer, sheet_name='Sheet1', index=False)
|
||||
|
||||
if __name__ == '__main__':
    # One-off manual run against the shared Dropbox folder.
    report = AttributionReport(months=6, footer_length=6)
    report.set_dataframe_sf("/Users/tyrelsouza/Dropbox (Addgene)/Addgene Shared/Dev/Attribution Report/salesforce_report.xlsx")
    report.set_dataframe_deposit("/Users/tyrelsouza/Dropbox (Addgene)/Addgene Shared/Dev/Attribution Report/deposit_data.csv")
    report.set_output_dir("/Users/tyrelsouza/Dropbox (Addgene)/Addgene Shared/Dev/Attribution Report/Output/")
    report.run()
    report.save()
|
||||
|
170
main.py
170
main.py
|
@ -1,170 +0,0 @@
|
|||
import os
|
||||
import json
|
||||
|
||||
import pandas as pd
|
||||
import dropbox
|
||||
from dateutil.relativedelta import relativedelta
|
||||
|
||||
# Toggle: when True, report files are fetched through the Dropbox API;
# when False they are read from the local Dropbox folder below.
DROPBOX = False
local_dropbox_path = "/Users/tyrelsouza/Dropbox (Addgene)/"

# Date column names in the Salesforce report and the deposit data file.
SF_DATE = "Date"
DP_DATE = "Date Received"
|
||||
|
||||
|
||||
class AttributionReport(object):
    """Legacy attribution report builder.

    Loads the Salesforce and deposit reports (via the Dropbox API when
    DROPBOX is True, otherwise from the local Dropbox folder) and writes
    names.xls / orgs.xls with the deposit rows that match each
    Salesforce row by PI name or by organization within the window.
    """

    def __init__(self, credentials_file, months=6, footer_length=None):
        """
        credentials_file -- JSON file with Dropbox app_key/app_secret
                            and, after first authorization,
                            access_token/user_id.
        months -- matching window after each Salesforce date.
        footer_length -- footer rows Salesforce appends to its export.
        """
        self.months = months
        self.footer_length = footer_length
        self.PI_COLUMN = "PI_Name"
        self.ORG_COLUMN = "Org Name"

        with open(credentials_file, "r") as cred_f:
            creds = json.loads(cred_f.read())

        self.app_key = creds['app_key']
        self.app_secret = creds['app_secret']

        # No stored token yet: run the interactive OAuth flow.
        if not creds.get("access_token", None):
            self.authorize()
        else:
            self.access_token = creds['access_token']
            self.user_id = creds['user_id']

    def authorize(self):
        """Run the Dropbox OAuth2 no-redirect flow and cache the token."""
        flow = dropbox.client.DropboxOAuth2FlowNoRedirect(self.app_key, self.app_secret)
        authorize_url = flow.start()
        # Single-argument parenthesized print behaves identically on
        # Python 2 and keeps the module importable on Python 3.
        print('1. Go to: ' + authorize_url)
        print('2. Click "Allow" (you might have to log in first)')
        print('3. Copy the authorization code.')
        code = raw_input("Enter the authorization code here: ").strip()
        access_token, user_id = flow.finish(code)
        self.access_token = access_token
        self.user_id = user_id

        creds = {"app_key": self.app_key,
                 "app_secret": self.app_secret,
                 "access_token": self.access_token,
                 "user_id": self.user_id}

        # Save so we don't have to do this again.
        with open("credentials.json", "w") as f:
            f.write(json.dumps(creds))

    def _open_file_frame(self, filename, date_cols):
        """Open *filename* (Dropbox API or local folder) as a DataFrame.

        .csv files go through read_csv; everything else through read_excel.
        """
        if DROPBOX:
            client = dropbox.client.DropboxClient(self.access_token)
            f = client.get_file(filename)
        else:
            f = os.path.normpath(local_dropbox_path + filename)

        if filename[-4:] == ".csv":
            df = pd.read_csv(f, parse_dates=date_cols, encoding='utf-8')
        else:
            df = pd.read_excel(f, parse_dates=date_cols, encoding='utf-8')
        return df

    def get_dataframes(self):
        """
        This gets the Salesforce and the Deposit dataframes.
        Then it does some cleanup of the columns
        """
        salesforce_data_name = '/Addgene Shared/Dev/Attribution Report/salesforce_report.xlsx'
        salesforce_df = self._open_file_frame(salesforce_data_name, date_cols=[4, 5])

        # Get rid of the footer that Salesforce adds.
        if self.footer_length:
            length_with_footer = len(salesforce_df.index)
            salesforce_df = salesforce_df.head(length_with_footer - self.footer_length)

        deposit_data_name = 'Addgene Shared/Dev/Attribution Report/deposit_data.csv'
        deposit_df = self._open_file_frame(deposit_data_name, date_cols=[7, 8])

        # Clean up Salesforce
        salesforce_df['Account Description'].fillna('', inplace=True)
        # BUG FIX: DataFrame.sort() returns a copy by default, so the
        # bare call was a no-op; sort in place and renumber the index so
        # positional slicing via searchsorted() lines up.
        salesforce_df.sort(SF_DATE, ascending=1, inplace=True)
        salesforce_df.reset_index(drop=True, inplace=True)
        salesforce_df["Full Name"] = salesforce_df["First Name"].map(unicode) + " " + salesforce_df["Last Name"]
        del salesforce_df["First Name"]
        del salesforce_df["Last Name"]

        # Cleanup Deposit Data
        deposit_df['Org Name'].fillna('', inplace=True)
        deposit_df.sort(DP_DATE, ascending=1, inplace=True)
        deposit_df.reset_index(drop=True, inplace=True)
        # BUG FIX: astype() returns a new Series; the result was discarded.
        deposit_df['PI_Name'] = deposit_df['PI_Name'].astype(unicode)

        return salesforce_df, deposit_df

    def get_filtered(self, filtered_df, sf_row, pi_name, pi_org, kind):
        """Return a list of match dicts for one Salesforce row.

        kind -- "PI" to match on PI name, "ORG" to match on organization.
        """
        if kind == "PI":
            filter_column = self.PI_COLUMN
            filter_value = pi_name
        elif kind == "ORG":
            filter_column = self.ORG_COLUMN
            filter_value = pi_org
        else:
            # BUG FIX: an unknown kind used to fall through and crash
            # later with UnboundLocalError; fail fast instead.
            raise ValueError("kind must be 'PI' or 'ORG', got {0!r}".format(kind))

        name_match = filtered_df[filtered_df[filter_column] == filter_value]

        output = []
        if not name_match.empty:
            for _, row in name_match.iterrows():
                data = {
                    "Addgene Assigned": sf_row['Assigned'],
                    "Plasmid ID": row['Plasmid ID'],
                    "Deposit ID": row['Deposit ID'],
                    "Institute": row['Org Name'],
                    "PI Name": row['PI_Name'],
                    "Date Received": row[DP_DATE],
                    "Original Date": sf_row[SF_DATE],
                    "Original ORG": pi_org,
                    "Original PI": pi_name,
                }
                output.append(data)
        return output

    def get_attribution_dataframes(self):
        """Build (name_matches_df, org_matches_df) from the two reports."""
        salesforce, dep = self.get_dataframes()
        name_matches = []
        org_matches = []

        # Iterate through the Salesforce report as the master document
        for index, sf_row in salesforce.iterrows():
            # Get a start date and an end date for filtering.
            start_date = sf_row[SF_DATE]
            end_date = start_date + relativedelta(months=self.months)

            start = dep[DP_DATE].searchsorted(start_date)[0]
            end = dep[DP_DATE].searchsorted(end_date)[0]

            # Filter the deposit data to grab only things within that timeframe.
            # BUG FIX: .ix is label-based and end-inclusive; searchsorted
            # yields positional indices, so use the half-open .iloc window.
            filtered_df = dep.iloc[start:end]

            # Variables for short names, and not having to type index a lot.
            pi_name = unicode(sf_row['Full Name'])
            pi_org = sf_row['Account Name']

            # Get matches by the PI's name
            by_name = self.get_filtered(filtered_df,
                                        sf_row,
                                        pi_name,
                                        pi_org,
                                        kind="PI")
            name_matches.extend(by_name)

            # Get matches by the organization name
            by_org = self.get_filtered(filtered_df,
                                       sf_row,
                                       pi_name,
                                       pi_org,
                                       kind="ORG")
            org_matches.extend(by_org)
        return pd.DataFrame(name_matches), pd.DataFrame(org_matches)

    def run(self):
        """Generate both match DataFrames and write them to disk."""
        name_df, org_df = self.get_attribution_dataframes()
        # BUG FIX: was name_df.to_excelv(...), which raised AttributeError.
        name_df.to_excel("names.xls")
        org_df.to_excel("orgs.xls")
|
||||
|
||||
if __name__ == '__main__':
    # Manual invocation using the locally cached Dropbox credentials.
    app = AttributionReport(credentials_file="credentials.json", months=6, footer_length=6)
    app.run()
|
|
@ -1,4 +1,4 @@
|
|||
dropbox
|
||||
pandas
|
||||
xlrd
|
||||
python-dateutil
|
||||
XlsxWriter==0.7.3
|
||||
pandas==0.16.2
|
||||
python-dateutil==2.4.2
|
||||
xlrd==0.9.4
|
||||
|
|
25
utils.py
Normal file
25
utils.py
Normal file
|
@ -0,0 +1,25 @@
|
|||
__author__ = 'tyrelsouza'
|
||||
import os
|
||||
import json
|
||||
|
||||
|
||||
def get_dropbox_dir():
    """Locate the user's Dropbox directory on Windows or Mac/Linux.

    Reads Dropbox's own info.json, preferring a Business account path
    over a personal one, and falls back to the home directory when
    neither is configured.
    """
    # Dropbox keeps its config in a platform-specific location.
    if os.name == "nt":
        info_path = os.path.join(os.getenv('APPDATA'), 'Dropbox', 'info.json')
    else:
        info_path = os.path.expanduser("~/.dropbox/info.json")

    with open(info_path) as info_file:
        config = json.loads(info_file.read())

    # Business accounts take precedence over personal ones.
    for account_kind in ("business", "personal"):
        if account_kind in config:
            return config[account_kind]['path']
    return os.path.expanduser("~")
|
Loading…
Reference in New Issue
Block a user