before gui

EasyGui interface cleanup Cleanup, rename, move functions around, context managers store package versions reqs added encoding compiled renamed Reveal file when done Remove reveal temp dont do big files dont store full name, use first and last later added gui minsize cleanup Gui, utils, etc cleanup from pyinstaller gui done for now filetypes fix Progress bar, multithreading GUI Changes, progress bars, more error handling. Add distribution gitignore add make mac Added spec file Make Win fix ValueError bug rebuild mac Windows EXE mac app rename Readme added readme updates, and example on attribution.py delete setup remove prints remove threading remove requirement
2022-08-18 17:01:39 -04:00
5 changed files with 253 additions and 174 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,3 @@
 *.pyc
 .idea
 build*
--- a/attribution.py
+++ b/attribution.py
@ -0,0 +1,221 @@
 # -*- coding: utf-8 -*-
 import os
 import datetime
 import logging
 import pandas as pd
 from dateutil.relativedelta import relativedelta
 # Module-level logger, named after this module.
 L = logging.getLogger(__name__)
 class AttributionReport(object):
    """
        Attribute plasmid deposits to earlier Salesforce contacts.

        Each Salesforce row opens a window of `months` months starting at its
        date.  Deposits received inside that window are matched against the
        contact's PI name and organisation, and every kind of match is saved
        as its own XLSX file.

        Usage: set_dataframe_sf(), set_dataframe_deposit(), set_output_dir(),
        then run() and save().
    """

    def __init__(self, months=6, footer_length=None):
        """
            months: length of the matching window after each Salesforce date.
            footer_length: number of trailing rows to drop from the Salesforce
                report (the footer Salesforce adds); falsy keeps every row.
        """
        self.months = months
        self.footer_length = footer_length
        self.SF_DATE_COLUMN = "Date"
        self.DP_DATE_COLUMN = "Date Received"
        self.PI_COLUMN = "PI_Name"
        self.ORG_COLUMN = "Org Name"
        # Output the XLSX in this order
        self.OUTPUT_COLUMN_ORDER = ["Addgene Assigned", "Plasmid ID", "Deposit ID", "Institute", "PI Name",
                                    "Date Received", "Original Date", "Original ORG", "Original PI"]
        self.ACCEPTABLE_EXTENSIONS = ["*.csv", "*.xls", "*.xlsx"]
        # columns that need to be in the files
        self.REQUIRED_SF_COLUMNS = ["First Name", "Last Name", "Account Name", "Date", "Assigned"]
        self.REQUIRED_DP_COLUMNS = ["Org Name", "Deposit ID", "Plasmid ID", "PI_Name", "Date Received"]
        # After load and merging, delete these columns
        self.SF_TRIM_COLUMNS = ["Subject", "Created Date", "LIMS Organization ID",
                                "Account Description"]
        self.DP_TRIM_COLUMNS = ["Org ID", "Deposit Status", "PI_ID", "Date Available", "# Orders",
                                "# Plasmids in the Deposit", "Addgene Contact", "Country"]
        self.salesforce_df = None
        self.deposit_df = None
        self.output_dir = None
        self.frames = None
        # The frame objects produced by the last clean_dataframes() call; used
        # to recognise frames that are already clean.
        self._cleaned_sf_df = None
        self._cleaned_dp_df = None

    def _get_dataframe_by_extension(self, path, date_cols):
        """
            Gets a dataframe either by .csv, or .xls(x) (the extension is
            compared case-insensitively), parsing `date_cols` as dates.
            Raises an Exception for any other extension.
        """
        _, ext = os.path.splitext(path)
        ext = ext.lower()
        if ext == ".csv":
            return pd.read_csv(path, parse_dates=date_cols, encoding='utf-8')
        if ext in (".xlsx", ".xls"):
            return pd.read_excel(path, parse_dates=date_cols, encoding='utf-8')
        raise Exception("File was not of type {0}.\nQuitting".format(
            " ".join(self.ACCEPTABLE_EXTENSIONS)))

    def _load_with_columns(self, fname, date_column, required_columns):
        """
            Load `fname` and return the dataframe only if it contains every
            required column.  Returns None when the reader raises IndexError
            or ValueError, or when a required column is missing.  Other
            exceptions (e.g. an unsupported extension) propagate.
        """
        try:
            frame = self._get_dataframe_by_extension(fname, date_cols=[date_column])
        except (IndexError, ValueError):
            return None
        # "<=" and not "<": a file with exactly the required columns is valid.
        if set(required_columns) <= set(frame.columns):
            return frame
        L.info("Wrong columns")
        return None

    def set_dataframe_sf(self, fname):
        """
            Load the Salesforce report from `fname`.
            Returns True and stores the frame in salesforce_df when it has all
            REQUIRED_SF_COLUMNS; otherwise returns False and leaves
            salesforce_df as None.
        """
        # Reset first so a raising load never leaves a stale frame behind.
        self.salesforce_df = None
        self.salesforce_df = self._load_with_columns(
            fname, self.SF_DATE_COLUMN, self.REQUIRED_SF_COLUMNS)
        return self.salesforce_df is not None

    def set_dataframe_deposit(self, fname):
        """
            Load the deposit data from `fname`.
            Returns True and stores the frame in deposit_df when it has all
            REQUIRED_DP_COLUMNS; otherwise returns False and leaves
            deposit_df as None.
        """
        # Reset first so a raising load never leaves a stale frame behind.
        self.deposit_df = None
        self.deposit_df = self._load_with_columns(
            fname, self.DP_DATE_COLUMN, self.REQUIRED_DP_COLUMNS)
        return self.deposit_df is not None

    def set_output_dir(self, dir):
        """Remember the directory that save() writes the workbooks into."""
        self.output_dir = dir

    def get_dataframes(self):
        """Return the cleaned (salesforce_df, deposit_df) pair."""
        return self.clean_dataframes()

    @staticmethod
    def _sorted_by(frame, column):
        """Return `frame` ordered by `column`, re-indexed 0..n-1."""
        # DataFrame.sort returns a new frame unless inplace=True, so the
        # result has to be kept.
        return frame.sort(column, ascending=True).reset_index(drop=True)

    @staticmethod
    def _without_columns(frame, columns):
        """Return `frame` without those of `columns` that it contains."""
        # The trim columns are not required ones, so they may be absent.
        present = [col for col in columns if col in frame.columns]
        return frame.drop(present, axis=1)

    def _clean_salesforce(self, frame):
        """Drop the Salesforce footer, sort by date, remove unneeded columns."""
        # Get rid of the footer that Salesforce adds.
        if self.footer_length:
            frame = frame.head(len(frame.index) - self.footer_length)
        frame = self._sorted_by(frame, self.SF_DATE_COLUMN)
        return self._without_columns(frame, self.SF_TRIM_COLUMNS)

    def _clean_deposit(self, frame):
        """Sort by date received, normalise the name columns, trim columns."""
        frame = self._sorted_by(frame, self.DP_DATE_COLUMN)
        frame['Org Name'] = frame['Org Name'].fillna('')
        # Fill before converting so a missing name does not become u"nan".
        frame['PI_Name'] = frame['PI_Name'].fillna(u'').astype(unicode)
        return self._without_columns(frame, self.DP_TRIM_COLUMNS)

    def clean_dataframes(self):
        """
            Clean both loaded frames in place on the instance and return them
            as (salesforce_df, deposit_df).

            A frame that is still the object produced by the previous call is
            left alone, so calling this repeatedly does not trim the footer
            more than once.
        """
        if self.salesforce_df is not getattr(self, "_cleaned_sf_df", None):
            self.salesforce_df = self._clean_salesforce(self.salesforce_df)
            self._cleaned_sf_df = self.salesforce_df
        if self.deposit_df is not getattr(self, "_cleaned_dp_df", None):
            self.deposit_df = self._clean_deposit(self.deposit_df)
            self._cleaned_dp_df = self.deposit_df
        return self.salesforce_df, self.deposit_df

    def get_filtered(self, filtered_df, sf_row, pi_name, pi_org, org=False):
        """
            Match deposits in `filtered_df` by PI name, or by organisation
            when `org` is true.

            Returns (single, double): lists of output-row dicts.  A matched
            deposit goes to `double` when both its PI and its organisation
            equal the Salesforce contact's, otherwise to `single`.
        """
        if org:
            filter_column, filter_value = self.ORG_COLUMN, pi_org
        else:
            filter_column, filter_value = self.PI_COLUMN, pi_name
        single, double = [], []
        matches = filtered_df[filtered_df[filter_column] == filter_value]
        for _, row in matches.iterrows():
            data = {
                "Addgene Assigned": sf_row['Assigned'],
                "Plasmid ID": row['Plasmid ID'],
                "Deposit ID": row['Deposit ID'],
                "Institute": row['Org Name'],
                "PI Name": row['PI_Name'],
                "Date Received": row[self.DP_DATE_COLUMN],
                "Original Date": sf_row[self.SF_DATE_COLUMN],
                "Original ORG": pi_org,
                "Original PI": pi_name,
            }
            matches_both = (data['Institute'] == data['Original ORG']) and \
                (data['PI Name'] == data['Original PI'])
            (double if matches_both else single).append(data)
        return single, double

    def get_attribution_dataframes(self):
        """
            Build the report frames.

            Returns a tuple of (label, DataFrame) pairs labelled "PI",
            "Institute", "Double" and "Single", each with its columns in
            OUTPUT_COLUMN_ORDER.
        """
        salesforce_df, deposit_df = self.clean_dataframes()
        deposit_dates = deposit_df[self.DP_DATE_COLUMN]
        name_matches = []
        org_matches = []
        double_matches = []
        mismatches = []
        # Iterate through the Salesforce report as the master document
        for _, sf_row in salesforce_df.iterrows():
            # Get a start date and an end date for filtering.
            start_date = sf_row[self.SF_DATE_COLUMN]
            if pd.isnull(start_date):
                # Without a date there is no window to search.
                continue
            end_date = start_date + relativedelta(months=self.months)
            # Positions in the date-sorted deposits (the pinned pandas returns
            # an array here, hence the [0]).
            start = deposit_dates.searchsorted(start_date)[0]
            end = deposit_dates.searchsorted(end_date)[0]
            # Positional, end-exclusive slice: only deposits inside the window.
            filtered_df = deposit_df.iloc[start:end]
            # Variables for short names, and not having to type index a lot.
            pi_name = unicode(sf_row['First Name']) + " " + unicode(sf_row['Last Name'])
            pi_org = sf_row['Account Name']
            # Get matches by the PI's name
            by_name, pi_by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org)
            name_matches.extend(by_name)
            mismatches.extend(by_name)
            double_matches.extend(pi_by_both)
            # Get matches by the organization name
            by_org, org_by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org, org=True)
            org_matches.extend(by_org)
            mismatches.extend(by_org)
            double_matches.extend(org_by_both)
        return (
            ("PI", pd.DataFrame(name_matches, columns=self.OUTPUT_COLUMN_ORDER)),
            ("Institute", pd.DataFrame(org_matches, columns=self.OUTPUT_COLUMN_ORDER)),
            ("Double", pd.DataFrame(double_matches, columns=self.OUTPUT_COLUMN_ORDER)),
            ("Single", pd.DataFrame(mismatches, columns=self.OUTPUT_COLUMN_ORDER))
        )

    def run(self):
        """Compute the report frames and keep them in self.frames for save()."""
        self.frames = None
        self.frames = self.get_attribution_dataframes()

    def save(self):
        """
            Write each frame in self.frames, with duplicate rows removed, to
            "<today>_Attribution_Report_<label>_Match.xlsx" in output_dir.
        """
        for key, df in self.frames:
            fname = '{0}_Attribution_Report_{1}_Match.xlsx'.format(datetime.date.today(), key)
            output_path = os.path.join(self.output_dir, fname)
            deduped_df = df.drop_duplicates()
            with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
                deduped_df.to_excel(writer, sheet_name='Sheet1', index=False)
 if __name__ == '__main__':
    # Example run against the developer's local Dropbox folder; the paths
    # below are machine-specific.
    report_dir = "/Users/tyrelsouza/Dropbox (Addgene)/Addgene Shared/Dev/Attribution Report/"
    app = AttributionReport(months=6, footer_length=6)
    # The loaders report failure by returning False; stop with a message
    # instead of failing later inside run() on a missing dataframe.
    if not app.set_dataframe_deposit(report_dir + "deposit_data.csv"):
        raise SystemExit("Could not load the deposit data file")
    if not app.set_dataframe_sf(report_dir + "salesforce_report.xlsx"):
        raise SystemExit("Could not load the Salesforce report file")
    app.set_output_dir(report_dir + "Output/")
    app.run()
    app.save()
--- a/main.py
+++ b/main.py
@ -1,170 +0,0 @@
 import os
 import json
 import pandas as pd
 import dropbox
 from dateutil.relativedelta import relativedelta
 # When true, _open_file_frame fetches files through the Dropbox API client;
 # otherwise it reads them from the local Dropbox folder below.
 DROPBOX = False
 # Local Dropbox root that file names are appended to when DROPBOX is false.
 local_dropbox_path = "/Users/tyrelsouza/Dropbox (Addgene)/"
 # Name of the date column in the Salesforce report.
 SF_DATE = "Date"
 # Name of the date column in the deposit data.
 DP_DATE = "Date Received"
 class AttributionReport(object):
    """
        Build PI-name and organisation attribution reports from the
        Salesforce report and the deposit data, read either through the
        Dropbox API or from the local Dropbox folder (see DROPBOX).
    """

    def __init__(self, credentials_file, months=6, footer_length=None):
        """
            credentials_file: JSON file holding app_key and app_secret, and
                optionally access_token and user_id.
            months: length of the matching window after each Salesforce date.
            footer_length: number of trailing Salesforce rows to drop.
        """
        self.months = months
        self.footer_length = footer_length
        self.PI_COLUMN = "PI_Name"
        self.ORG_COLUMN = "Org Name"
        # Remembered so authorize() saves the token to the file it is read from.
        self.credentials_file = credentials_file
        with open(credentials_file, "r") as cred_f:
            creds = json.load(cred_f)
        self.app_key = creds['app_key']
        self.app_secret = creds['app_secret']
        if creds.get("access_token"):
            self.access_token = creds['access_token']
            self.user_id = creds['user_id']
        else:
            self.authorize()

    def authorize(self):
        """
            Run the interactive Dropbox OAuth2 flow on the console, then store
            the access token and user id on the instance and in the
            credentials file.
        """
        flow = dropbox.client.DropboxOAuth2FlowNoRedirect(self.app_key, self.app_secret)
        authorize_url = flow.start()
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('1. Go to: ' + authorize_url)
        print('2. Click "Allow" (you might have to log in first)')
        print('3. Copy the authorization code.')
        code = raw_input("Enter the authorization code here: ").strip()
        access_token, user_id = flow.finish(code)
        self.access_token = access_token
        self.user_id = user_id
        creds = {"app_key": self.app_key,
                 "app_secret": self.app_secret,
                 "access_token": self.access_token,
                 "user_id": self.user_id}
        # Save so we don't have to do this again.  Write back to the file the
        # credentials were loaded from, which is the one read on the next run.
        target = getattr(self, "credentials_file", "credentials.json")
        with open(target, "w") as f:
            f.write(json.dumps(creds))

    def _open_file_frame(self, filename, date_cols):
        """
            Read `filename` into a dataframe, parsing `date_cols` as dates.
            The file comes from the Dropbox API when DROPBOX is true,
            otherwise from under local_dropbox_path.  Names ending in .csv
            are read as CSV, everything else as Excel.
        """
        if DROPBOX:
            client = dropbox.client.DropboxClient(self.access_token)
            f = client.get_file(filename)
        else:
            f = os.path.normpath(local_dropbox_path + filename)
        if filename.lower().endswith(".csv"):
            return pd.read_csv(f, parse_dates=date_cols, encoding='utf-8')
        return pd.read_excel(f, parse_dates=date_cols, encoding='utf-8')

    def get_dataframes(self):
        """
            This gets the Salesforce and the Deposit dataframes.
            Then it does some cleanup of the columns and sorts both by date.
        """
        salesforce_data_name = '/Addgene Shared/Dev/Attribution Report/salesforce_report.xlsx'
        salesforce_df = self._open_file_frame(salesforce_data_name, date_cols=[4, 5])
        if self.footer_length:
            length_with_footer = len(salesforce_df.index)
            salesforce_df = salesforce_df.head(length_with_footer - self.footer_length)
        deposit_data_name = 'Addgene Shared/Dev/Attribution Report/deposit_data.csv'
        deposit_df = self._open_file_frame(deposit_data_name, date_cols=[7, 8])
        # Clean up Salesforce.  DataFrame.sort returns a new frame, so the
        # result is kept and re-indexed 0..n-1.
        salesforce_df = salesforce_df.sort(SF_DATE, ascending=True).reset_index(drop=True)
        salesforce_df['Account Description'] = salesforce_df['Account Description'].fillna('')
        salesforce_df["Full Name"] = salesforce_df["First Name"].map(unicode) + " " + salesforce_df["Last Name"]
        del salesforce_df["First Name"]
        del salesforce_df["Last Name"]
        # Cleanup Deposit Data
        deposit_df = deposit_df.sort(DP_DATE, ascending=True).reset_index(drop=True)
        deposit_df['Org Name'] = deposit_df['Org Name'].fillna('')
        # Fill before converting so a missing name does not become u"nan".
        deposit_df['PI_Name'] = deposit_df['PI_Name'].fillna(u'').astype(unicode)
        return salesforce_df, deposit_df

    def get_filtered(self, filtered_df, sf_row, pi_name, pi_org, kind):
        """
            Return output-row dicts for the deposits in `filtered_df` whose PI
            name (kind="PI") or organisation (kind="ORG") equals the
            Salesforce contact's.  Any other `kind` raises ValueError.
        """
        if kind == "PI":
            filter_column = self.PI_COLUMN
            filter_value = pi_name
        elif kind == "ORG":
            filter_column = self.ORG_COLUMN
            filter_value = pi_org
        else:
            raise ValueError("kind must be 'PI' or 'ORG', got {0!r}".format(kind))
        name_match = filtered_df[filtered_df[filter_column] == filter_value]
        output = []
        for _, row in name_match.iterrows():
            output.append({
                "Addgene Assigned": sf_row['Assigned'],
                "Plasmid ID": row['Plasmid ID'],
                "Deposit ID": row['Deposit ID'],
                "Institute": row['Org Name'],
                "PI Name": row['PI_Name'],
                "Date Received": row[DP_DATE],
                "Original Date": sf_row[SF_DATE],
                "Original ORG": pi_org,
                "Original PI": pi_name,
            })
        return output

    def get_attribution_dataframes(self):
        """
            Return (name_matches, org_matches) dataframes: for each Salesforce
            row, the deposits received within `months` months of its date
            that match by PI name and by organisation respectively.
        """
        salesforce, dep = self.get_dataframes()
        dep_dates = dep[DP_DATE]
        name_matches = []
        org_matches = []
        # Iterate through the Salesforce report as the master document
        for index, sf_row in salesforce.iterrows():
            # Get a start date and an end date for filtering.
            start_date = sf_row[SF_DATE]
            end_date = start_date + relativedelta(months=self.months)
            start = dep_dates.searchsorted(start_date)[0]
            end = dep_dates.searchsorted(end_date)[0]
            # Positional, end-exclusive slice: only deposits inside the window.
            filtered_df = dep.iloc[start:end]
            # Variables for short names, and not having to type index a lot.
            pi_name = unicode(sf_row['Full Name'])
            pi_org = sf_row['Account Name']
            # Get matches by the PI's name
            name_matches.extend(
                self.get_filtered(filtered_df, sf_row, pi_name, pi_org, kind="PI"))
            # Get matches by the organization name
            org_matches.extend(
                self.get_filtered(filtered_df, sf_row, pi_name, pi_org, kind="ORG"))
        return pd.DataFrame(name_matches), pd.DataFrame(org_matches)

    def run(self):
        """Build both reports and write them to names.xls and orgs.xls."""
        name_df, org_df = self.get_attribution_dataframes()
        name_df.to_excel("names.xls")
        org_df.to_excel("orgs.xls")
 if __name__ == '__main__':
    # Script entry point: build the report from the saved Dropbox credentials
    # with a six-month window and a six-row Salesforce footer, then write it.
    AttributionReport(credentials_file="credentials.json", months=6, footer_length=6).run()
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,4 @@
-dropbox
+XlsxWriter==0.7.3
-pandas
+pandas==0.16.2
-xlrd
+python-dateutil==2.4.2
-python-dateutil
+xlrd==0.9.4
--- a/utils.py
+++ b/utils.py
@ -0,0 +1,25 @@
 __author__ = 'tyrelsouza'
 import os
 import json
 def get_dropbox_dir():
    """
        Windows and Mac: get the Dropbox dir for Business, or fall back to
        the personal one.

        The location is read from Dropbox's info.json.  When that file is
        missing, unreadable, or names no account path, the user's home
        directory is returned instead.
    """
    if os.name == "nt":
        # info.json may sit under either of these roots; skip unset variables
        # so os.path.join is never handed None.
        roots = [os.getenv(name) for name in ("APPDATA", "LOCALAPPDATA")]
        candidates = [os.path.join(root, 'Dropbox', 'info.json') for root in roots if root]
    else:
        candidates = [os.path.expanduser("~/.dropbox/info.json")]
    for dropbox_file in candidates:
        try:
            with open(dropbox_file) as dbf:
                dbconfig = json.load(dbf)
        except (IOError, OSError, ValueError):
            # Missing, unreadable or malformed file: try the next location.
            continue
        if not isinstance(dbconfig, dict):
            continue
        # Business takes precedence over personal.
        for account in ("business", "personal"):
            entry = dbconfig.get(account)
            if isinstance(entry, dict) and entry.get('path'):
                return entry['path']
    return os.path.expanduser("~")