Compare commits
1 Commits
10ecf10ac6
...
e3d6952517
Author | SHA1 | Date |
---|---|---|
Tyrel Souza | e3d6952517 |
|
@ -0,0 +1,3 @@
|
|||
*.pyc
|
||||
.idea
|
||||
build*
|
|
@ -0,0 +1,221 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
import datetime
|
||||
import logging
|
||||
|
||||
import pandas as pd
|
||||
from dateutil.relativedelta import relativedelta
|
||||
|
||||
L = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AttributionReport(object):
|
||||
def __init__(self, months=6, footer_length=None):
|
||||
self.months = months
|
||||
self.footer_length = footer_length
|
||||
|
||||
self.SF_DATE_COLUMN = "Date"
|
||||
self.DP_DATE_COLUMN = "Date Received"
|
||||
self.PI_COLUMN = "PI_Name"
|
||||
self.ORG_COLUMN = "Org Name"
|
||||
# Output the XLSX in this order
|
||||
|
||||
self.OUTPUT_COLUMN_ORDER = ["Addgene Assigned", "Plasmid ID", "Deposit ID", "Institute", "PI Name",
|
||||
"Date Received", "Original Date", "Original ORG", "Original PI"]
|
||||
|
||||
self.ACCEPTABLE_EXTENSIONS = ["*.csv", "*.xls", "*.xlsx"]
|
||||
|
||||
# columns that need to be in the files
|
||||
self.REQUIRED_SF_COLUMNS = ["First Name", "Last Name", "Account Name", "Date", "Assigned"]
|
||||
self.REQUIRED_DP_COLUMNS = ["Org Name", "Deposit ID", "Plasmid ID", "PI_Name", "Date Received"]
|
||||
|
||||
# After load and merging, delete these columns
|
||||
self.SF_TRIM_COLUMNS = ["Subject", "Created Date", "LIMS Organization ID",
|
||||
"Account Description"]
|
||||
self.DP_TRIM_COLUMNS = ["Org ID", "Deposit Status", "PI_ID", "Date Available", "# Orders",
|
||||
"# Plasmids in the Deposit", "Addgene Contact", "Country"]
|
||||
|
||||
self.salesforce_df = None
|
||||
self.deposit_df = None
|
||||
self.output_dir = None
|
||||
self.frames = None
|
||||
|
||||
def _get_dataframe_by_extension(self, path, date_cols):
|
||||
"""
|
||||
Gets a dataframe either by .csv, or .xls(x),
|
||||
or erroring and exiting.
|
||||
"""
|
||||
_, ext = os.path.splitext(path)
|
||||
|
||||
if ext == ".csv":
|
||||
df = pd.read_csv(path, parse_dates=date_cols, encoding='utf-8')
|
||||
elif ext in [".xlsx", ".xls"]:
|
||||
df = pd.read_excel(path, parse_dates=date_cols, encoding='utf-8')
|
||||
else:
|
||||
raise Exception("File was not of type {0}.\nQuitting".format(
|
||||
" ".join(self.ACCEPTABLE_EXTENSIONS)))
|
||||
return df
|
||||
|
||||
def set_dataframe_sf(self, fname):
|
||||
self.salesforce_df = None
|
||||
try:
|
||||
salesforce_df = self._get_dataframe_by_extension(fname, date_cols=[self.SF_DATE_COLUMN, ])
|
||||
except IndexError:
|
||||
return False
|
||||
except ValueError:
|
||||
return False
|
||||
except:
|
||||
raise
|
||||
|
||||
if set(self.REQUIRED_SF_COLUMNS) < set(salesforce_df.columns):
|
||||
self.salesforce_df = salesforce_df
|
||||
return True
|
||||
L.info("Wrong columns")
|
||||
return False
|
||||
|
||||
def set_dataframe_deposit(self, fname):
|
||||
self.deposit_df = None
|
||||
try:
|
||||
deposit_df = self._get_dataframe_by_extension(fname, date_cols=[self.DP_DATE_COLUMN, ])
|
||||
except IndexError:
|
||||
return False
|
||||
except ValueError:
|
||||
return False
|
||||
except:
|
||||
raise
|
||||
if set(self.REQUIRED_DP_COLUMNS) < set(deposit_df.columns):
|
||||
self.deposit_df = deposit_df
|
||||
return True
|
||||
L.info("Wrong columns")
|
||||
return False
|
||||
|
||||
def set_output_dir(self, dir):
|
||||
self.output_dir = dir
|
||||
|
||||
def get_dataframes(self):
|
||||
salesforce_df, deposit_df = self.clean_dataframes()
|
||||
return salesforce_df, deposit_df
|
||||
|
||||
def clean_dataframes(self):
|
||||
# Get rid of the footer that Salesforce adds.
|
||||
if self.footer_length:
|
||||
length_with_footer = len(self.salesforce_df.index)
|
||||
self.salesforce_df = self.salesforce_df.head(length_with_footer - self.footer_length)
|
||||
|
||||
# Clean up Salesforce
|
||||
self.salesforce_df.sort(self.SF_DATE_COLUMN, ascending=1)
|
||||
|
||||
# Cleanup Deposit Data
|
||||
self.deposit_df['Org Name'].fillna('', inplace=True)
|
||||
self.deposit_df.sort(self.DP_DATE_COLUMN, ascending=1)
|
||||
self.deposit_df['PI_Name'].astype(unicode)
|
||||
|
||||
# Cleanup not needed columns
|
||||
for col in self.SF_TRIM_COLUMNS:
|
||||
del self.salesforce_df[col]
|
||||
for col in self.DP_TRIM_COLUMNS:
|
||||
del self.deposit_df[col]
|
||||
|
||||
def get_filtered(self, filtered_df, sf_row, pi_name, pi_org, org=False):
|
||||
"""
|
||||
Assume kind is PI by default.
|
||||
Filter where either the PI and PI match, or the Org and Org match
|
||||
If both match, add it to the the double list
|
||||
if only one matches, add it to the single list.
|
||||
"""
|
||||
filter_column = self.PI_COLUMN
|
||||
filter_value = pi_name
|
||||
single, double = [], []
|
||||
|
||||
if org:
|
||||
filter_column = self.ORG_COLUMN
|
||||
filter_value = pi_org
|
||||
|
||||
name_match = filtered_df[filtered_df[filter_column] == filter_value]
|
||||
|
||||
if not name_match.empty:
|
||||
for _, row in name_match.iterrows():
|
||||
data = {
|
||||
"Addgene Assigned": sf_row['Assigned'],
|
||||
"Plasmid ID": row['Plasmid ID'],
|
||||
"Deposit ID": row['Deposit ID'],
|
||||
"Institute": row['Org Name'],
|
||||
"PI Name": row['PI_Name'],
|
||||
"Date Received": row[self.DP_DATE_COLUMN],
|
||||
"Original Date": sf_row[self.SF_DATE_COLUMN],
|
||||
"Original ORG": pi_org,
|
||||
"Original PI": pi_name,
|
||||
}
|
||||
if (data['Institute'] == data['Original ORG']) and \
|
||||
(data['PI Name'] == data['Original PI']):
|
||||
double.append(data)
|
||||
else:
|
||||
single.append(data)
|
||||
return single, double
|
||||
|
||||
def get_attribution_dataframes(self):
|
||||
self.clean_dataframes()
|
||||
|
||||
name_matches = []
|
||||
org_matches = []
|
||||
double_matches = []
|
||||
mismatches = []
|
||||
|
||||
# Iterate through the Salesforce report as the master document
|
||||
for index, sf_row in self.salesforce_df.iterrows():
|
||||
# Get a start date and an end date for filtering.
|
||||
start_date = sf_row[self.SF_DATE_COLUMN]
|
||||
end_date = start_date + relativedelta(months=self.months)
|
||||
|
||||
start = self.deposit_df[self.DP_DATE_COLUMN].searchsorted(start_date)[0]
|
||||
end = self.deposit_df[self.DP_DATE_COLUMN].searchsorted(end_date)[0]
|
||||
|
||||
# Filter the deposit data to grab only things within that timeframe.
|
||||
filtered_df = self.deposit_df.ix[start:end]
|
||||
|
||||
# Variables for short names, and not having to type index a lot.
|
||||
pi_name = unicode(sf_row['First Name']) + " " + unicode(sf_row['Last Name'])
|
||||
pi_org = sf_row['Account Name']
|
||||
|
||||
# Get matches by the PI's name
|
||||
by_name, pi_by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org)
|
||||
name_matches.extend(by_name)
|
||||
mismatches.extend(by_name)
|
||||
double_matches.extend(pi_by_both)
|
||||
|
||||
# Get matches by the organization name
|
||||
by_org, org_by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org, org=True)
|
||||
org_matches.extend(by_org)
|
||||
mismatches.extend(by_org)
|
||||
double_matches.extend(org_by_both)
|
||||
|
||||
return (
|
||||
("PI", pd.DataFrame(name_matches, columns=self.OUTPUT_COLUMN_ORDER)),
|
||||
("Institute", pd.DataFrame(org_matches, columns=self.OUTPUT_COLUMN_ORDER)),
|
||||
("Double", pd.DataFrame(double_matches, columns=self.OUTPUT_COLUMN_ORDER)),
|
||||
("Single", pd.DataFrame(mismatches, columns=self.OUTPUT_COLUMN_ORDER))
|
||||
)
|
||||
|
||||
def run(self):
|
||||
self.frames = None
|
||||
self.frames = self.get_attribution_dataframes()
|
||||
|
||||
def save(self):
|
||||
for key, df in self.frames:
|
||||
fname = '{0}_Attribution_Report_{1}_Match.xlsx'.format(datetime.date.today(), key)
|
||||
|
||||
output_path = os.path.join(self.output_dir, fname)
|
||||
|
||||
deduped_df = df.drop_duplicates()
|
||||
|
||||
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
|
||||
deduped_df.to_excel(writer, sheet_name='Sheet1', index=False)
|
||||
|
||||
if __name__ == '__main__':
|
||||
app = AttributionReport(months=6, footer_length=6)
|
||||
app.set_dataframe_deposit("/Users/tyrelsouza/Dropbox (Addgene)/Addgene Shared/Dev/Attribution Report/deposit_data.csv")
|
||||
app.set_dataframe_sf("/Users/tyrelsouza/Dropbox (Addgene)/Addgene Shared/Dev/Attribution Report/salesforce_report.xlsx")
|
||||
app.set_output_dir("/Users/tyrelsouza/Dropbox (Addgene)/Addgene Shared/Dev/Attribution Report/Output/")
|
||||
app.run()
|
||||
app.save()
|
||||
|
254
main.py
254
main.py
|
@ -1,254 +0,0 @@
|
|||
import os
|
||||
import json
|
||||
import datetime
|
||||
import logging
|
||||
import sys
|
||||
|
||||
import pandas as pd
|
||||
from dateutil.relativedelta import relativedelta
|
||||
import easygui
|
||||
|
||||
L = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AttributionReport(object):
|
||||
def __init__(self, credentials_file, months=6, footer_length=None):
|
||||
self.months = months
|
||||
self.footer_length = footer_length
|
||||
|
||||
self.SF_DATE_COLUMN = "Date"
|
||||
self.DP_DATE_COLUMN = "Date Received"
|
||||
self.PI_COLUMN = "PI_Name"
|
||||
self.ORG_COLUMN = "Org Name"
|
||||
# Output the XLSX in this order
|
||||
|
||||
self.OUTPUT_COLUMN_ORDER = ["Addgene Assigned", "Plasmid ID", "Deposit ID", "Institute", "PI Name",
|
||||
"Date Received", "Original Date", "Original ORG", "Original PI"]
|
||||
|
||||
self.ACCEPTABLE_EXTENSIONS = ["*.csv", "*.xls", "*.xlsx"]
|
||||
|
||||
# columns that need to be in the files
|
||||
self.REQUIRED_SF_COLUMNS = ["First Name", "Last Name", "Account Name", "Date", "Assigned"]
|
||||
self.REQUIRED_DP_COLUMNS = ["Org Name", "Deposit ID", "Plasmid ID", "PI_Name", "Date Received"]
|
||||
|
||||
# After load and merging, delete these columns
|
||||
self.SF_TRIM_COLUMNS = ["Subject", "First Name", "Last Name", "Created Date", "LIMS Organization ID",
|
||||
"Account Description"]
|
||||
self.DP_TRIM_COLUMNS = ["Org ID", "Deposit Status", "PI_ID", "Date Available", "# Orders",
|
||||
"# Plasmids in the Deposit", "Addgene Contact", "Country"]
|
||||
|
||||
self.DEFAULT_DIR = self.get_dropbox_dir()
|
||||
|
||||
def get_dropbox_dir(self):
|
||||
"""
|
||||
Windows and Mac get dropox dir for Business or fallback to personal
|
||||
"""
|
||||
if os.name == "nt":
|
||||
dropbox_file = os.path.join(os.getenv('APPDATA'), 'Dropbox', 'info.json')
|
||||
else:
|
||||
dropbox_file = os.path.expanduser("~/.dropbox/info.json")
|
||||
with open(dropbox_file) as dbf:
|
||||
dbconfig = json.loads(dbf.read())
|
||||
|
||||
if "business" in dbconfig:
|
||||
dropbox_dir = dbconfig['business']['path'] + "/*.xls"
|
||||
elif "personal" in dbconfig:
|
||||
dropbox_dir = dbconfig['personal']['path'] + "/*.xls"
|
||||
else:
|
||||
dropbox_dir = os.path.expanduser("~")
|
||||
|
||||
return dropbox_dir
|
||||
|
||||
def _get_dataframe_by_extension(self, path, date_cols):
|
||||
"""
|
||||
Gets a dataframe either by .csv, or .xls(x),
|
||||
or erroring and exiting.
|
||||
"""
|
||||
_, ext = os.path.splitext(path)
|
||||
|
||||
if ext == ".csv":
|
||||
df = pd.read_csv(path, parse_dates=date_cols, encoding='utf-8')
|
||||
elif ext in [".xlsx", ".xls"]:
|
||||
df = pd.read_excel(path, parse_dates=date_cols, encoding='utf-8')
|
||||
else:
|
||||
easygui.msgbox("File was not of type {0}.\nQuitting".format(
|
||||
" ".join(self.ACCEPTABLE_EXTENSIONS)),
|
||||
"ERROR")
|
||||
sys.exit(1)
|
||||
return df
|
||||
|
||||
def get_dataframes(self):
|
||||
"""
|
||||
This gets the Salesforce and the Deposit dataframes.
|
||||
Then it does some cleanup of the columns
|
||||
"""
|
||||
salesforce_df, deposit_df = self.get_files()
|
||||
|
||||
# Get rid of the footer that Salesforce adds.
|
||||
if self.footer_length:
|
||||
length_with_footer = len(salesforce_df.index)
|
||||
salesforce_df = salesforce_df.head(length_with_footer - self.footer_length)
|
||||
|
||||
# Clean up Salesforce
|
||||
salesforce_df.sort(self.SF_DATE_COLUMN, ascending=1)
|
||||
salesforce_df["Full Name"] = salesforce_df["First Name"].map(unicode) + " " + salesforce_df["Last Name"]
|
||||
|
||||
# Cleanup Deposit Data
|
||||
deposit_df['Org Name'].fillna('', inplace=True)
|
||||
deposit_df.sort(self.DP_DATE_COLUMN, ascending=1)
|
||||
deposit_df['PI_Name'].astype(unicode)
|
||||
|
||||
# Cleanup not needed columns
|
||||
for col in self.SF_TRIM_COLUMNS:
|
||||
del salesforce_df[col]
|
||||
for col in self.DP_TRIM_COLUMNS:
|
||||
del deposit_df[col]
|
||||
|
||||
return salesforce_df, deposit_df
|
||||
|
||||
def get_files(self):
|
||||
salesforce_data_name = easygui.fileopenbox("Salesforce Export",
|
||||
default=self.DEFAULT_DIR,
|
||||
filetypes=self.ACCEPTABLE_EXTENSIONS)
|
||||
if salesforce_data_name == ".":
|
||||
easygui.msgbox("You did not select a Salesforce Export, stopping program.",
|
||||
"Good Bye")
|
||||
sys.exit(1)
|
||||
salesforce_df = self._get_dataframe_by_extension(salesforce_data_name, date_cols=[4, 5])
|
||||
if set(self.REQUIRED_SF_COLUMNS) < set(salesforce_df.columns):
|
||||
L.info("Proper columns")
|
||||
else:
|
||||
L.info("Wrong columns")
|
||||
easygui.msgbox("At a minimum, the Salesforce file must have the following columns:\n\n"
|
||||
"{0}\n\n"
|
||||
"Please re-run and select a proper file.".format(", ".join(self.REQUIRED_SF_COLUMNS)),
|
||||
"Incorrect columns")
|
||||
sys.exit(1)
|
||||
|
||||
deposit_data_name = easygui.fileopenbox("Deposit Data",
|
||||
default=self.DEFAULT_DIR,
|
||||
filetypes=self.ACCEPTABLE_EXTENSIONS)
|
||||
if deposit_data_name == ".":
|
||||
easygui.msgbox("You did not select a Deposit Data Export, stopping program.",
|
||||
"Good Bye")
|
||||
sys.exit(1)
|
||||
deposit_df = self._get_dataframe_by_extension(deposit_data_name, date_cols=[7, 8])
|
||||
if set(self.REQUIRED_DP_COLUMNS) < set(deposit_df.columns):
|
||||
L.info("Proper columns")
|
||||
else:
|
||||
L.info("Wrong columns")
|
||||
easygui.msgbox("At a minimum, the Deposit Data file must have the following columns:\n\n"
|
||||
"{0}\n\n"
|
||||
"Please re-run and select a proper file.".format(", ".join(self.REQUIRED_DP_COLUMNS)),
|
||||
"Incorrect columns")
|
||||
sys.exit(1)
|
||||
|
||||
return salesforce_df, deposit_df
|
||||
|
||||
def get_filtered(self, filtered_df, sf_row, pi_name, pi_org, kind):
|
||||
if kind == "PI":
|
||||
filter_column = self.PI_COLUMN
|
||||
filter_value = pi_name
|
||||
elif kind == "ORG":
|
||||
filter_column = self.ORG_COLUMN
|
||||
filter_value = pi_org
|
||||
|
||||
name_match = filtered_df[filtered_df[filter_column] == filter_value]
|
||||
|
||||
single = []
|
||||
double = []
|
||||
if not name_match.empty:
|
||||
for _, row in name_match.iterrows():
|
||||
data = {
|
||||
"Addgene Assigned": sf_row['Assigned'],
|
||||
"Plasmid ID": row['Plasmid ID'],
|
||||
"Deposit ID": row['Deposit ID'],
|
||||
"Institute": row['Org Name'],
|
||||
"PI Name": row['PI_Name'],
|
||||
"Date Received": row[self.DP_DATE_COLUMN],
|
||||
"Original Date": sf_row[self.SF_DATE_COLUMN],
|
||||
"Original ORG": pi_org,
|
||||
"Original PI": pi_name,
|
||||
}
|
||||
if (data['Institute'] == data['Original ORG']) and \
|
||||
(data['PI Name'] == data['Original PI']):
|
||||
double.append(data)
|
||||
else:
|
||||
single.append(data)
|
||||
return single, double
|
||||
|
||||
def get_attribution_dataframes(self):
|
||||
salesforce, dep = self.get_dataframes()
|
||||
name_matches = []
|
||||
org_matches = []
|
||||
double_matches = []
|
||||
mismatches = []
|
||||
|
||||
# Iterate through the Salesforce report as the master document
|
||||
for index, sf_row in salesforce.iterrows():
|
||||
# Get a start date and an end date for filtering.
|
||||
start_date = sf_row[self.SF_DATE_COLUMN]
|
||||
end_date = start_date + relativedelta(months=self.months)
|
||||
|
||||
start = dep[self.DP_DATE_COLUMN].searchsorted(start_date)[0]
|
||||
end = dep[self.DP_DATE_COLUMN].searchsorted(end_date)[0]
|
||||
|
||||
# Filter the deposit data to grab only things within that timeframe.
|
||||
filtered_df = dep.ix[start:end]
|
||||
|
||||
# Variables for short names, and not having to type index a lot.
|
||||
pi_name = unicode(sf_row['Full Name'])
|
||||
pi_org = sf_row['Account Name']
|
||||
|
||||
# Get matches by the PI's name
|
||||
by_name, by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org, kind="PI")
|
||||
name_matches.extend(by_name)
|
||||
mismatches.extend(by_name)
|
||||
double_matches.extend(by_both)
|
||||
|
||||
# Get matches by the organization name
|
||||
by_org, by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org, kind="ORG")
|
||||
org_matches.extend(by_org)
|
||||
mismatches.extend(by_org)
|
||||
double_matches.extend(by_both)
|
||||
|
||||
return (
|
||||
("PI", pd.DataFrame(name_matches, columns=self.OUTPUT_COLUMN_ORDER)),
|
||||
("Institute", pd.DataFrame(org_matches, columns=self.OUTPUT_COLUMN_ORDER)),
|
||||
("Double", pd.DataFrame(double_matches, columns=self.OUTPUT_COLUMN_ORDER)),
|
||||
("Single", pd.DataFrame(mismatches, columns=self.OUTPUT_COLUMN_ORDER))
|
||||
)
|
||||
|
||||
def run(self):
|
||||
frames = self.get_attribution_dataframes()
|
||||
|
||||
dirname = easygui.diropenbox("Where to save reports?", "Select Report Output Directory", self.DEFAULT_DIR)
|
||||
if not dirname:
|
||||
dirname = self.DEFAULT_DIR
|
||||
|
||||
for key, df in frames:
|
||||
fname = '{0}_Attribution_Report_{1}_Match.xlsx'.format(datetime.date.today(), key)
|
||||
|
||||
xls_path = os.path.join(dirname, fname)
|
||||
print "Writing", xls_path
|
||||
|
||||
writer = pd.ExcelWriter(xls_path,
|
||||
engine='xlsxwriter')
|
||||
df.to_excel(writer,
|
||||
sheet_name='Sheet1',
|
||||
index=False)
|
||||
writer.save()
|
||||
|
||||
|
||||
def main():
|
||||
try:
|
||||
report = AttributionReport(credentials_file="credentials.json",
|
||||
months=6,
|
||||
footer_length=6)
|
||||
report.run()
|
||||
except:
|
||||
easygui.exceptionbox()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
|
@ -1,9 +1,4 @@
|
|||
dropbox
|
||||
pandas
|
||||
xlrd
|
||||
python-dateutil
|
||||
xlwt
|
||||
#openpyxl==1.8.6
|
||||
xlsxwriter
|
||||
progressbar==2.3
|
||||
easygui
|
||||
XlsxWriter==0.7.3
|
||||
pandas==0.16.2
|
||||
python-dateutil==2.4.2
|
||||
xlrd==0.9.4
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
__author__ = 'tyrelsouza'
|
||||
import os
|
||||
import json
|
||||
|
||||
|
||||
def get_dropbox_dir():
|
||||
"""
|
||||
Windows and Mac get dropox dir for Business or fallback to personal
|
||||
"""
|
||||
if os.name == "nt":
|
||||
dropbox_file = os.path.join(os.getenv('APPDATA'), 'Dropbox', 'info.json')
|
||||
else:
|
||||
dropbox_file = os.path.expanduser("~/.dropbox/info.json")
|
||||
|
||||
with open(dropbox_file) as dbf:
|
||||
dbconfig = json.loads(dbf.read())
|
||||
|
||||
if "business" in dbconfig:
|
||||
dropbox_dir = dbconfig['business']['path']
|
||||
elif "personal" in dbconfig:
|
||||
dropbox_dir = dbconfig['personal']['path']
|
||||
else:
|
||||
dropbox_dir = os.path.expanduser("~")
|
||||
|
||||
return dropbox_dir
|
Loading…
Reference in New Issue