# attribution_report/main.py

import os
import json
import datetime
import logging
import sys

import pandas as pd
from dateutil.relativedelta import relativedelta
import easygui

# Module-level logger, named after this module.
L = logging.getLogger(__name__)


class AttributionReport(object):
    """
        Cross-references a Salesforce export with a deposit-data export.

        For every Salesforce row, deposits received within ``months`` months
        after the Salesforce date are searched for a matching PI name and/or
        organization name.  The matches are written out as four XLSX reports
        (PI, Institute, Double, Single).
    """

    def __init__(self, credentials_file, months=6, footer_length=None):
        """
            :param credentials_file: accepted for interface compatibility;
                it is not read anywhere in this class.
            :param months: size of the attribution window, in months, counted
                from each Salesforce row's date.
            :param footer_length: number of trailing rows to drop from the
                Salesforce export (falsy value = keep every row).
        """
        self.months = months
        self.footer_length = footer_length

        self.SF_DATE_COLUMN = "Date"
        self.DP_DATE_COLUMN = "Date Received"
        self.PI_COLUMN = "PI_Name"
        self.ORG_COLUMN = "Org Name"

        # Output the XLSX in this order
        self.OUTPUT_COLUMN_ORDER = ["Addgene Assigned", "Plasmid ID", "Deposit ID", "Institute", "PI Name",
                                    "Date Received", "Original Date", "Original ORG", "Original PI"]

        self.ACCEPTABLE_EXTENSIONS = ["*.csv", "*.xls", "*.xlsx"]

        # columns that need to be in the files
        self.REQUIRED_SF_COLUMNS = ["First Name", "Last Name", "Account Name", "Date", "Assigned"]
        self.REQUIRED_DP_COLUMNS = ["Org Name", "Deposit ID", "Plasmid ID", "PI_Name", "Date Received"]

        # After load and merging, delete these columns
        self.SF_TRIM_COLUMNS = ["Subject", "First Name", "Last Name", "Created Date", "LIMS Organization ID",
                                "Account Description"]
        self.DP_TRIM_COLUMNS = ["Org ID", "Deposit Status", "PI_ID", "Date Available", "# Orders",
                                "# Plasmids in the Deposit", "Addgene Contact", "Country"]

        self.DEFAULT_DIR = self.get_dropbox_dir()

    def get_dropbox_dir(self):
        """
            Returns the default location offered in the file dialogs.

            Looks up the Dropbox folder from Dropbox's ``info.json`` (Business
            account first, then Personal) and returns it with a ``/*.xls``
            pattern appended.  Falls back to the user's home directory when
            Dropbox is not installed, or its config is unreadable/incomplete,
            instead of crashing the constructor.
        """
        home_dir = os.path.expanduser("~")

        if os.name == "nt":
            # Either variable may be unset; skip those rather than joining None.
            roots = [os.getenv(var) for var in ("APPDATA", "LOCALAPPDATA")]
            candidates = [os.path.join(root, 'Dropbox', 'info.json') for root in roots if root]
        else:
            candidates = [os.path.expanduser("~/.dropbox/info.json")]

        dbconfig = {}
        for dropbox_file in candidates:
            try:
                with open(dropbox_file) as dbf:
                    loaded = json.load(dbf)
            except (IOError, OSError, ValueError):
                # Missing, unreadable or malformed config: try the next one.
                continue
            if isinstance(loaded, dict):
                dbconfig = loaded
                break

        for account_kind in ("business", "personal"):
            account = dbconfig.get(account_kind)
            if isinstance(account, dict) and account.get('path'):
                return account['path'] + "/*.xls"

        return home_dir

    def _default_output_dir(self):
        """
            Returns DEFAULT_DIR as a plain directory, i.e. without the
            trailing file pattern (``/*.xls``) used for the file dialogs.
        """
        head, tail = os.path.split(self.DEFAULT_DIR)
        return head if "*" in tail else self.DEFAULT_DIR

    def _get_dataframe_by_extension(self, path, date_cols):
        """
            Gets a dataframe either by .csv, or .xls(x),
            or erroring and exiting.

            The extension is compared case-insensitively.  ``date_cols`` is
            forwarded as ``parse_dates`` to the pandas reader.
        """
        ext = os.path.splitext(path)[1].lower()

        if ext == ".csv":
            df = pd.read_csv(path, parse_dates=date_cols, encoding='utf-8')
        elif ext in [".xlsx", ".xls"]:
            df = pd.read_excel(path, parse_dates=date_cols, encoding='utf-8')
        else:
            easygui.msgbox("File was not of type {0}.\nQuitting".format(
                " ".join(self.ACCEPTABLE_EXTENSIONS)),
                "ERROR")
            sys.exit(1)
        return df

    @staticmethod
    def _has_required_columns(required, columns):
        """
            True when every name in ``required`` is present in ``columns``.
            Extra columns are allowed, and so is an exact match.
        """
        return set(required) <= set(columns)

    @staticmethod
    def _sort_by(frame, column):
        """
            Returns ``frame`` sorted ascending by ``column``.
            Uses ``sort_values`` when the installed pandas provides it and
            the older ``DataFrame.sort`` otherwise.
        """
        sorter = getattr(frame, "sort_values", None)
        if sorter is None:
            sorter = frame.sort
        return sorter(column, ascending=True)

    @staticmethod
    def _drop_columns(frame, columns):
        """
            Returns ``frame`` without the given columns; names that are not
            present are ignored (they are not part of the required columns).
        """
        present = [col for col in columns if col in frame.columns]
        return frame.drop(present, axis=1)

    def get_dataframes(self):
        """
            This gets the Salesforce and the Deposit dataframes.
            Then it does some cleanup of the columns.

            Both frames are returned sorted ascending by their date column;
            the date-window lookup in get_attribution_dataframes relies on
            the deposit frame being sorted.
        """
        salesforce_df, deposit_df = self.get_files()

        # Get rid of the footer that Salesforce adds.
        if self.footer_length:
            length_with_footer = len(salesforce_df.index)
            salesforce_df = salesforce_df.head(length_with_footer - self.footer_length)

        # Clean up Salesforce (sorting returns a new frame, so keep the result)
        salesforce_df = self._sort_by(salesforce_df, self.SF_DATE_COLUMN)
        salesforce_df["Full Name"] = salesforce_df["First Name"].map(unicode) + " " + salesforce_df["Last Name"]

        # Cleanup Deposit Data
        deposit_df = self._sort_by(deposit_df, self.DP_DATE_COLUMN)
        deposit_df['Org Name'] = deposit_df['Org Name'].fillna('')
        # Blank out missing names first so they do not turn into the text "nan".
        deposit_df['PI_Name'] = deposit_df['PI_Name'].fillna('').astype(unicode)

        # Cleanup not needed columns
        salesforce_df = self._drop_columns(salesforce_df, self.SF_TRIM_COLUMNS)
        deposit_df = self._drop_columns(deposit_df, self.DP_TRIM_COLUMNS)

        return salesforce_df, deposit_df

    def _load_validated_frame(self, title, export_name, source_name, date_cols, required_columns):
        """
            Asks the user for one input file, loads it and checks its columns.
            Shows a message box and exits the program when no file is chosen
            or a required column is missing.

            :param title: title of the file-open dialog.
            :param export_name: name used in the "did not select" message.
            :param source_name: name used in the "incorrect columns" message.
            :param date_cols: column positions to parse as dates.
            :param required_columns: column names the file must contain.
        """
        data_name = easygui.fileopenbox(title,
                                        default=self.DEFAULT_DIR,
                                        filetypes=self.ACCEPTABLE_EXTENSIONS)
        # easygui reports a cancelled dialog as None; "." is kept for
        # compatibility with the original check.
        if not data_name or data_name == ".":
            easygui.msgbox("You did not select a {0}, stopping program.".format(export_name),
                           "Good Bye")
            sys.exit(1)

        frame = self._get_dataframe_by_extension(data_name, date_cols=date_cols)
        if self._has_required_columns(required_columns, frame.columns):
            L.info("Proper columns")
        else:
            L.info("Wrong columns")
            easygui.msgbox("At a minimum, the {0} file must have the following columns:\n\n"
                           "{1}\n\n"
                           "Please re-run and select a proper file.".format(source_name,
                                                                            ", ".join(required_columns)),
                           "Incorrect columns")
            sys.exit(1)
        return frame

    def get_files(self):
        """
            Prompts for the Salesforce export and the deposit-data export and
            returns them as ``(salesforce_df, deposit_df)``.
            Date columns are selected by position ([4, 5] and [7, 8]).
        """
        salesforce_df = self._load_validated_frame("Salesforce Export",
                                                   "Salesforce Export",
                                                   "Salesforce",
                                                   date_cols=[4, 5],
                                                   required_columns=self.REQUIRED_SF_COLUMNS)
        deposit_df = self._load_validated_frame("Deposit Data",
                                                "Deposit Data Export",
                                                "Deposit Data",
                                                date_cols=[7, 8],
                                                required_columns=self.REQUIRED_DP_COLUMNS)
        return salesforce_df, deposit_df

    def get_filtered(self, filtered_df, sf_row, pi_name, pi_org, kind):
        """
            Finds the deposits in ``filtered_df`` that match one Salesforce row.

            :param filtered_df: deposit rows to search.
            :param sf_row: the Salesforce row being attributed.
            :param pi_name: PI name taken from the Salesforce row.
            :param pi_org: organization name taken from the Salesforce row.
            :param kind: "PI" to match on the PI name, "ORG" to match on the
                organization name.
            :returns: ``(single, double)`` lists of output records; ``double``
                holds deposits where both PI and organization agree,
                ``single`` those matching only on the ``kind`` column.
            :raises ValueError: if ``kind`` is neither "PI" nor "ORG".
        """
        criteria = {
            "PI": (self.PI_COLUMN, pi_name),
            "ORG": (self.ORG_COLUMN, pi_org),
        }
        if kind not in criteria:
            raise ValueError("kind must be 'PI' or 'ORG', got {0!r}".format(kind))
        filter_column, filter_value = criteria[kind]

        name_match = filtered_df[filtered_df[filter_column] == filter_value]

        single = []
        double = []
        for _, row in name_match.iterrows():
            data = {
                "Addgene Assigned": sf_row['Assigned'],
                "Plasmid ID": row['Plasmid ID'],
                "Deposit ID": row['Deposit ID'],
                "Institute": row['Org Name'],
                "PI Name": row['PI_Name'],
                "Date Received": row[self.DP_DATE_COLUMN],
                "Original Date": sf_row[self.SF_DATE_COLUMN],
                "Original ORG": pi_org,
                "Original PI": pi_name,
            }
            both_agree = (data['Institute'] == data['Original ORG'] and
                          data['PI Name'] == data['Original PI'])
            if both_agree:
                double.append(data)
            else:
                single.append(data)
        return single, double

    def _deposits_in_window(self, dep, start_date, end_date):
        """
            Returns the rows of ``dep`` received in ``[start_date, end_date)``.
            ``dep`` must already be sorted ascending by the deposit date.
        """
        # Passing a list yields an array of positions regardless of whether
        # the installed pandas returns a scalar or an array for a scalar.
        start, end = dep[self.DP_DATE_COLUMN].searchsorted([start_date, end_date])
        # searchsorted gives positions, so slice by position, not by label.
        return dep.iloc[int(start):int(end)]

    def get_attribution_dataframes(self):
        """
            Builds the four report frames.

            :returns: a tuple of ``(name, DataFrame)`` pairs named "PI",
                "Institute", "Double" and "Single".  "Single" collects the
                single-criterion matches from both the PI and the
                organization passes.
        """
        salesforce, dep = self.get_dataframes()
        name_matches = []
        org_matches = []
        double_matches = []
        mismatches = []

        # Iterate through the Salesforce report as the master document
        for _, sf_row in salesforce.iterrows():
            # Get a start date and an end date for filtering.
            start_date = sf_row[self.SF_DATE_COLUMN]
            end_date = start_date + relativedelta(months=self.months)

            # Filter the deposit data to grab only things within that timeframe.
            filtered_df = self._deposits_in_window(dep, start_date, end_date)

            # Variables for short names, and not having to type index a lot.
            pi_name = unicode(sf_row['Full Name'])
            pi_org = sf_row['Account Name']

            # Get matches by the PI's name
            by_name, by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org, kind="PI")
            name_matches.extend(by_name)
            mismatches.extend(by_name)
            double_matches.extend(by_both)

            # Get matches by the organization name
            by_org, by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org, kind="ORG")
            org_matches.extend(by_org)
            mismatches.extend(by_org)
            double_matches.extend(by_both)

        return (
            ("PI", pd.DataFrame(name_matches, columns=self.OUTPUT_COLUMN_ORDER)),
            ("Institute", pd.DataFrame(org_matches, columns=self.OUTPUT_COLUMN_ORDER)),
            ("Double", pd.DataFrame(double_matches, columns=self.OUTPUT_COLUMN_ORDER)),
            ("Single", pd.DataFrame(mismatches, columns=self.OUTPUT_COLUMN_ORDER))
        )

    def run(self):
        """
            Builds the reports, asks where to save them and writes one XLSX
            file per report, named
            ``<today>_Attribution_Report_<name>_Match.xlsx``.
        """
        frames = self.get_attribution_dataframes()

        # DEFAULT_DIR may end in a file pattern; the directory dialog and the
        # fallback both need the bare directory.
        default_dir = self._default_output_dir()
        dirname = easygui.diropenbox("Where to save reports?", "Select Report Output Directory", default_dir)
        if not dirname:
            dirname = default_dir

        for key, df in frames:
            fname = '{0}_Attribution_Report_{1}_Match.xlsx'.format(datetime.date.today(), key)

            xls_path = os.path.join(dirname, fname)
            print("Writing {0}".format(xls_path))

            writer = pd.ExcelWriter(xls_path,
                                    engine='xlsxwriter')
            df.to_excel(writer,
                        sheet_name='Sheet1',
                        index=False)
            writer.save()


def main():
    """
        Runs the attribution report with a 6 month window and a 6 row
        Salesforce footer, showing any unexpected error in a dialog.
    """
    try:
        report = AttributionReport(credentials_file="credentials.json",
                                   months=6,
                                   footer_length=6)
        report.run()
    except Exception:
        # Only real errors land here: SystemExit (raised by the deliberate
        # sys.exit calls after a message box) and KeyboardInterrupt are not
        # Exception subclasses, so they end the program without a traceback
        # dialog.
        easygui.exceptionbox()


# Run the report only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()