attribution_report/attribution.py

218 lines
8.1 KiB
Python
Raw Normal View History

2015-07-17 04:15:20 +00:00
# -*- coding: utf-8 -*-
2015-07-16 18:13:45 +00:00
import os
2015-07-16 19:10:17 +00:00
import datetime
import logging
2015-07-17 14:48:17 +00:00
import subprocess
2015-07-16 18:13:45 +00:00
import pandas as pd
from dateutil.relativedelta import relativedelta
2015-07-17 14:48:17 +00:00
2015-07-16 19:10:17 +00:00
# Module-level logger; `L` is this file's shorthand for the logger instance.
L = logging.getLogger(__name__)
2015-07-16 18:13:45 +00:00
2015-07-16 18:13:45 +00:00
class AttributionReport(object):
    """Match Salesforce contact rows to subsequent plasmid deposits.

    The Salesforce report is the master document: for every contact row,
    deposits received within ``months`` of the contact date are matched by
    PI name and/or organization name and bucketed into four frames
    ("PI", "Institute", "Double", "Single") which ``save`` writes as XLSX.
    """

    def __init__(self, months=6, footer_length=None):
        # Window (in months) after a Salesforce contact date in which a
        # deposit counts as attributable.
        self.months = months
        # Number of trailing footer rows Salesforce appends to its exports;
        # None/0 means there is no footer to strip.
        self.footer_length = footer_length

        self.SF_DATE_COLUMN = "Date"
        self.DP_DATE_COLUMN = "Date Received"
        self.PI_COLUMN = "PI_Name"
        self.ORG_COLUMN = "Org Name"

        # Output the XLSX in this order.
        self.OUTPUT_COLUMN_ORDER = ["Addgene Assigned", "Plasmid ID", "Deposit ID", "Institute", "PI Name",
                                    "Date Received", "Original Date", "Original ORG", "Original PI"]
        self.ACCEPTABLE_EXTENSIONS = ["*.csv", "*.xls", "*.xlsx"]

        # Columns that must be present in the input files.
        self.REQUIRED_SF_COLUMNS = ["First Name", "Last Name", "Account Name", "Date", "Assigned"]
        self.REQUIRED_DP_COLUMNS = ["Org Name", "Deposit ID", "Plasmid ID", "PI_Name", "Date Received"]

        # After load and merging, delete these columns.
        self.SF_TRIM_COLUMNS = ["Subject", "Created Date", "LIMS Organization ID",
                                "Account Description"]
        self.DP_TRIM_COLUMNS = ["Org ID", "Deposit Status", "PI_ID", "Date Available", "# Orders",
                                "# Plasmids in the Deposit", "Addgene Contact", "Country"]

        self.salesforce_df = None
        self.deposit_df = None
        self.output_dir = None
        self.frames = None

    def _get_dataframe_by_extension(self, path, date_cols):
        """Load ``path`` into a DataFrame based on its file extension.

        Raises:
            Exception: if the extension is not one of .csv/.xls/.xlsx.
        """
        _, ext = os.path.splitext(path)

        if ext == ".csv":
            df = pd.read_csv(path, parse_dates=date_cols, encoding='utf-8')
        elif ext in (".xlsx", ".xls"):
            # read_excel detects the encoding itself; the `encoding`
            # keyword was removed from modern pandas.
            df = pd.read_excel(path, parse_dates=date_cols)
        else:
            raise Exception("File was not of type {0}.\nQuitting".format(
                " ".join(self.ACCEPTABLE_EXTENSIONS)))
        return df

    def set_dataframe_sf(self, fname):
        """Load the Salesforce report from ``fname``; return True if usable."""
        self.salesforce_df = None
        try:
            salesforce_df = self._get_dataframe_by_extension(
                fname, date_cols=[self.SF_DATE_COLUMN])
        except (IndexError, ValueError):
            # Malformed file / unparseable dates: leave the frame unset.
            return False

        # `<=` (subset), not `<` (proper subset): a file containing exactly
        # the required columns and nothing else is still valid.
        if set(self.REQUIRED_SF_COLUMNS) <= set(salesforce_df.columns):
            self.salesforce_df = salesforce_df
            return True
        L.info("Wrong columns")
        return False

    def set_dataframe_deposit(self, fname):
        """Load the deposit report from ``fname``; return True if usable."""
        self.deposit_df = None
        try:
            deposit_df = self._get_dataframe_by_extension(
                fname, date_cols=[self.DP_DATE_COLUMN])
        except (IndexError, ValueError):
            # Same failure handling as set_dataframe_sf for consistency.
            return False

        # Subset test, not proper-subset: see set_dataframe_sf.
        if set(self.REQUIRED_DP_COLUMNS) <= set(deposit_df.columns):
            self.deposit_df = deposit_df
            return True
        L.info("Wrong columns")
        return False

    def set_output_dir(self, dir):
        """Remember the directory where ``save`` writes the XLSX files."""
        self.output_dir = dir

    def get_dataframes(self):
        """Clean both frames and return (salesforce_df, deposit_df).

        clean_dataframes() mutates in place and returns None; the original
        code unpacked that None and raised TypeError on every call.
        """
        self.clean_dataframes()
        return self.salesforce_df, self.deposit_df

    def clean_dataframes(self):
        """Sort, normalize, and trim both loaded DataFrames in place.

        Safe to call more than once: column trimming ignores columns that
        were already removed.
        """
        # Get rid of the footer that Salesforce adds.
        if self.footer_length:
            keep = len(self.salesforce_df.index) - self.footer_length
            self.salesforce_df = self.salesforce_df.head(keep)

        # DataFrame.sort() returned a copy (and is removed in modern
        # pandas); the original discarded the result, so the frames were
        # never actually sorted even though searchsorted() relies on it.
        # Sort by date and reset the index so positions align with labels.
        self.salesforce_df = self.salesforce_df.sort_values(
            self.SF_DATE_COLUMN, ascending=True).reset_index(drop=True)

        # Cleanup deposit data.
        self.deposit_df['Org Name'] = self.deposit_df['Org Name'].fillna('')
        self.deposit_df = self.deposit_df.sort_values(
            self.DP_DATE_COLUMN, ascending=True).reset_index(drop=True)
        # astype() returns a copy; assign it back (the original discarded it).
        self.deposit_df['PI_Name'] = self.deposit_df['PI_Name'].astype(str)

        # Drop columns not needed downstream; errors='ignore' tolerates
        # columns that are absent or were removed by an earlier call.
        self.salesforce_df = self.salesforce_df.drop(
            columns=self.SF_TRIM_COLUMNS, errors='ignore')
        self.deposit_df = self.deposit_df.drop(
            columns=self.DP_TRIM_COLUMNS, errors='ignore')

    def get_filtered(self, filtered_df, sf_row, pi_name, pi_org, org=False):
        """
        Assume kind is PI by default.
        Filter where either the PI and PI match, or the Org and Org match.
        If both match, add it to the double list;
        if only one matches, add it to the single list.
        """
        if org:
            filter_column, filter_value = self.ORG_COLUMN, pi_org
        else:
            filter_column, filter_value = self.PI_COLUMN, pi_name

        single, double = [], []
        name_match = filtered_df[filtered_df[filter_column] == filter_value]
        # Iterating an empty frame is a no-op, so no emptiness guard needed.
        for _, row in name_match.iterrows():
            data = {
                "Addgene Assigned": sf_row['Assigned'],
                "Plasmid ID": row['Plasmid ID'],
                "Deposit ID": row['Deposit ID'],
                "Institute": row['Org Name'],
                "PI Name": row['PI_Name'],
                "Date Received": row[self.DP_DATE_COLUMN],
                "Original Date": sf_row[self.SF_DATE_COLUMN],
                "Original ORG": pi_org,
                "Original PI": pi_name,
            }
            if (data['Institute'] == data['Original ORG']) and \
                    (data['PI Name'] == data['Original PI']):
                double.append(data)
            else:
                single.append(data)
        return single, double

    def get_attribution_dataframes(self):
        """Build the four output frames: PI, Institute, Double, Single."""
        self.clean_dataframes()

        name_matches = []
        org_matches = []
        double_matches = []
        mismatches = []

        # Iterate through the Salesforce report as the master document.
        for _, sf_row in self.salesforce_df.iterrows():
            # Get a start date and an end date for filtering.
            start_date = sf_row[self.SF_DATE_COLUMN]
            # pd.DateOffset gives the same month arithmetic as
            # dateutil.relativedelta without the extra dependency.
            end_date = start_date + pd.DateOffset(months=self.months)

            # Series.searchsorted returns a scalar position in modern
            # pandas (older versions returned a 1-element array); int()
            # normalizes both.
            dates = self.deposit_df[self.DP_DATE_COLUMN]
            start = int(dates.searchsorted(start_date))
            end = int(dates.searchsorted(end_date))

            # Positional slice of deposits within the timeframe (.ix is
            # removed from pandas; indices are positions from searchsorted).
            filtered_df = self.deposit_df.iloc[start:end]

            # Short names, so we do not have to index sf_row repeatedly.
            pi_name = str(sf_row['First Name']) + " " + str(sf_row['Last Name'])
            pi_org = sf_row['Account Name']

            # Get matches by the PI's name.
            by_name, pi_by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org)
            name_matches.extend(by_name)
            mismatches.extend(by_name)
            double_matches.extend(pi_by_both)

            # Get matches by the organization name.
            by_org, org_by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org, org=True)
            org_matches.extend(by_org)
            mismatches.extend(by_org)
            double_matches.extend(org_by_both)

        return (
            ("PI", pd.DataFrame(name_matches, columns=self.OUTPUT_COLUMN_ORDER)),
            ("Institute", pd.DataFrame(org_matches, columns=self.OUTPUT_COLUMN_ORDER)),
            ("Double", pd.DataFrame(double_matches, columns=self.OUTPUT_COLUMN_ORDER)),
            ("Single", pd.DataFrame(mismatches, columns=self.OUTPUT_COLUMN_ORDER)),
        )

    def run(self):
        """Compute and cache the attribution frames."""
        self.frames = self.get_attribution_dataframes()

    def save(self):
        """Write each cached frame to an XLSX file, then open the folder."""
        for key, df in self.frames:
            fname = '{0}_Attribution_Report_{1}_Match.xlsx'.format(
                datetime.date.today(), key)
            output_path = os.path.join(self.output_dir, fname)
            deduped_df = df.drop_duplicates()
            with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
                deduped_df.to_excel(writer, sheet_name='Sheet1', index=False)

        # Open the window where the files are. shell=True is unnecessary
        # (and risky) with an argument list, so the list is passed directly.
        if os.name == "nt":
            subprocess.call(["explorer", self.output_dir])
        else:
            subprocess.call(["open", self.output_dir])