before gui

EasyGui interface

cleanup

Cleanup, rename, move functions around, context managers

store package versions

reqs

added encoding

compiled

renamed

Reveal file when done

Remove reveal

temp dont do big files

dont store full name, use first and last later

added gui

minsize

cleanup

Gui, utils, etc

cleanup from pyinstaller

gui done for now

filetypes fix

Progress bar, multithreading

GUI Changes, progress bars, more error handling.

Add distribution

gitignore

add make mac

Added spec file

Make Win

fix ValueError bug

rebuild mac

Windows EXE

mac app rename

Readme

added readme updates, and example on attribution.py

delete setup

remove prints

remove threading

remove requirement
This commit is contained in:
Tyrel Souza 2015-07-16 15:10:17 -04:00 committed by Tyrel Souza
parent 53ee1caeee
commit 90673f2724
No known key found for this signature in database
GPG Key ID: F6582CF1308A2360
5 changed files with 253 additions and 174 deletions

3
.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
*.pyc
.idea
build*

221
attribution.py Normal file
View File

@ -0,0 +1,221 @@
# -*- coding: utf-8 -*-
import os
import datetime
import logging
import pandas as pd
from dateutil.relativedelta import relativedelta
L = logging.getLogger(__name__)
class AttributionReport(object):
def __init__(self, months=6, footer_length=None):
self.months = months
self.footer_length = footer_length
self.SF_DATE_COLUMN = "Date"
self.DP_DATE_COLUMN = "Date Received"
self.PI_COLUMN = "PI_Name"
self.ORG_COLUMN = "Org Name"
# Output the XLSX in this order
self.OUTPUT_COLUMN_ORDER = ["Addgene Assigned", "Plasmid ID", "Deposit ID", "Institute", "PI Name",
"Date Received", "Original Date", "Original ORG", "Original PI"]
self.ACCEPTABLE_EXTENSIONS = ["*.csv", "*.xls", "*.xlsx"]
# columns that need to be in the files
self.REQUIRED_SF_COLUMNS = ["First Name", "Last Name", "Account Name", "Date", "Assigned"]
self.REQUIRED_DP_COLUMNS = ["Org Name", "Deposit ID", "Plasmid ID", "PI_Name", "Date Received"]
# After load and merging, delete these columns
self.SF_TRIM_COLUMNS = ["Subject", "Created Date", "LIMS Organization ID",
"Account Description"]
self.DP_TRIM_COLUMNS = ["Org ID", "Deposit Status", "PI_ID", "Date Available", "# Orders",
"# Plasmids in the Deposit", "Addgene Contact", "Country"]
self.salesforce_df = None
self.deposit_df = None
self.output_dir = None
self.frames = None
def _get_dataframe_by_extension(self, path, date_cols):
"""
Gets a dataframe either by .csv, or .xls(x),
or erroring and exiting.
"""
_, ext = os.path.splitext(path)
if ext == ".csv":
df = pd.read_csv(path, parse_dates=date_cols, encoding='utf-8')
elif ext in [".xlsx", ".xls"]:
df = pd.read_excel(path, parse_dates=date_cols, encoding='utf-8')
else:
raise Exception("File was not of type {0}.\nQuitting".format(
" ".join(self.ACCEPTABLE_EXTENSIONS)))
return df
def set_dataframe_sf(self, fname):
self.salesforce_df = None
try:
salesforce_df = self._get_dataframe_by_extension(fname, date_cols=[self.SF_DATE_COLUMN, ])
except IndexError:
return False
except ValueError:
return False
except:
raise
if set(self.REQUIRED_SF_COLUMNS) < set(salesforce_df.columns):
self.salesforce_df = salesforce_df
return True
L.info("Wrong columns")
return False
def set_dataframe_deposit(self, fname):
self.deposit_df = None
try:
deposit_df = self._get_dataframe_by_extension(fname, date_cols=[self.DP_DATE_COLUMN, ])
except IndexError:
return False
except ValueError:
return False
except:
raise
if set(self.REQUIRED_DP_COLUMNS) < set(deposit_df.columns):
self.deposit_df = deposit_df
return True
L.info("Wrong columns")
return False
def set_output_dir(self, dir):
self.output_dir = dir
def get_dataframes(self):
salesforce_df, deposit_df = self.clean_dataframes()
return salesforce_df, deposit_df
def clean_dataframes(self):
# Get rid of the footer that Salesforce adds.
if self.footer_length:
length_with_footer = len(self.salesforce_df.index)
self.salesforce_df = self.salesforce_df.head(length_with_footer - self.footer_length)
# Clean up Salesforce
self.salesforce_df.sort(self.SF_DATE_COLUMN, ascending=1)
# Cleanup Deposit Data
self.deposit_df['Org Name'].fillna('', inplace=True)
self.deposit_df.sort(self.DP_DATE_COLUMN, ascending=1)
self.deposit_df['PI_Name'].astype(unicode)
# Cleanup not needed columns
for col in self.SF_TRIM_COLUMNS:
del self.salesforce_df[col]
for col in self.DP_TRIM_COLUMNS:
del self.deposit_df[col]
def get_filtered(self, filtered_df, sf_row, pi_name, pi_org, org=False):
"""
Assume kind is PI by default.
Filter where either the PI and PI match, or the Org and Org match
If both match, add it to the the double list
if only one matches, add it to the single list.
"""
filter_column = self.PI_COLUMN
filter_value = pi_name
single, double = [], []
if org:
filter_column = self.ORG_COLUMN
filter_value = pi_org
name_match = filtered_df[filtered_df[filter_column] == filter_value]
if not name_match.empty:
for _, row in name_match.iterrows():
data = {
"Addgene Assigned": sf_row['Assigned'],
"Plasmid ID": row['Plasmid ID'],
"Deposit ID": row['Deposit ID'],
"Institute": row['Org Name'],
"PI Name": row['PI_Name'],
"Date Received": row[self.DP_DATE_COLUMN],
"Original Date": sf_row[self.SF_DATE_COLUMN],
"Original ORG": pi_org,
"Original PI": pi_name,
}
if (data['Institute'] == data['Original ORG']) and \
(data['PI Name'] == data['Original PI']):
double.append(data)
else:
single.append(data)
return single, double
def get_attribution_dataframes(self):
self.clean_dataframes()
name_matches = []
org_matches = []
double_matches = []
mismatches = []
# Iterate through the Salesforce report as the master document
for index, sf_row in self.salesforce_df.iterrows():
# Get a start date and an end date for filtering.
start_date = sf_row[self.SF_DATE_COLUMN]
end_date = start_date + relativedelta(months=self.months)
start = self.deposit_df[self.DP_DATE_COLUMN].searchsorted(start_date)[0]
end = self.deposit_df[self.DP_DATE_COLUMN].searchsorted(end_date)[0]
# Filter the deposit data to grab only things within that timeframe.
filtered_df = self.deposit_df.ix[start:end]
# Variables for short names, and not having to type index a lot.
pi_name = unicode(sf_row['First Name']) + " " + unicode(sf_row['Last Name'])
pi_org = sf_row['Account Name']
# Get matches by the PI's name
by_name, pi_by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org)
name_matches.extend(by_name)
mismatches.extend(by_name)
double_matches.extend(pi_by_both)
# Get matches by the organization name
by_org, org_by_both = self.get_filtered(filtered_df, sf_row, pi_name, pi_org, org=True)
org_matches.extend(by_org)
mismatches.extend(by_org)
double_matches.extend(org_by_both)
return (
("PI", pd.DataFrame(name_matches, columns=self.OUTPUT_COLUMN_ORDER)),
("Institute", pd.DataFrame(org_matches, columns=self.OUTPUT_COLUMN_ORDER)),
("Double", pd.DataFrame(double_matches, columns=self.OUTPUT_COLUMN_ORDER)),
("Single", pd.DataFrame(mismatches, columns=self.OUTPUT_COLUMN_ORDER))
)
def run(self):
self.frames = None
self.frames = self.get_attribution_dataframes()
def save(self):
for key, df in self.frames:
fname = '{0}_Attribution_Report_{1}_Match.xlsx'.format(datetime.date.today(), key)
output_path = os.path.join(self.output_dir, fname)
deduped_df = df.drop_duplicates()
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
deduped_df.to_excel(writer, sheet_name='Sheet1', index=False)
if __name__ == '__main__':
    # Hard-coded development paths: shared Dropbox folder on the author's Mac.
    base = "/Users/tyrelsouza/Dropbox (Addgene)/Addgene Shared/Dev/Attribution Report/"
    app = AttributionReport(months=6, footer_length=6)
    app.set_dataframe_deposit(base + "deposit_data.csv")
    app.set_dataframe_sf(base + "salesforce_report.xlsx")
    app.set_output_dir(base + "Output/")
    app.run()
    app.save()

170
main.py
View File

@ -1,170 +0,0 @@
import os
import json
import pandas as pd
import dropbox
from dateutil.relativedelta import relativedelta
DROPBOX = False
local_dropbox_path = "/Users/tyrelsouza/Dropbox (Addgene)/"
SF_DATE = "Date"
DP_DATE = "Date Received"
class AttributionReport(object):
def __init__(self, credentials_file, months=6, footer_length=None):
self.months = months
self.footer_length = footer_length
self.PI_COLUMN = "PI_Name"
self.ORG_COLUMN = "Org Name"
with open(credentials_file, "r") as cred_f:
creds = json.loads(cred_f.read())
self.app_key = creds['app_key']
self.app_secret = creds['app_secret']
if not creds.get("access_token", None):
self.authorize()
else:
self.access_token = creds['access_token']
self.user_id = creds['user_id']
def authorize(self):
flow = dropbox.client.DropboxOAuth2FlowNoRedirect(self.app_key, self.app_secret)
authorize_url = flow.start()
print '1. Go to: ' + authorize_url
print '2. Click "Allow" (you might have to log in first)'
print '3. Copy the authorization code.'
code = raw_input("Enter the authorization code here: ").strip()
access_token, user_id = flow.finish(code)
self.access_token = access_token
self.user_id = user_id
creds = {"app_key": self.app_key,
"app_secret": self.app_secret,
"access_token": self.access_token,
"user_id": self.user_id}
# Save so we don't have to do this again.
with open("credentials.json", "w") as f:
f.write(json.dumps(creds))
def _open_file_frame(self, filename, date_cols):
if DROPBOX:
client = dropbox.client.DropboxClient(self.access_token)
f = client.get_file(filename)
else:
f = os.path.normpath(local_dropbox_path + filename)
if filename[-4:] == ".csv":
df = pd.read_csv(f, parse_dates=date_cols, encoding='utf-8')
else:
df = pd.read_excel(f, parse_dates=date_cols, encoding='utf-8')
return df
def get_dataframes(self):
"""
This gets the Salesforce and the Deposit dataframes.
Then it does some cleanup of the columns
"""
salesforce_data_name = '/Addgene Shared/Dev/Attribution Report/salesforce_report.xlsx'
salesforce_df = self._open_file_frame(salesforce_data_name, date_cols=[4, 5])
if self.footer_length:
length_with_footer = len(salesforce_df.index)
salesforce_df = salesforce_df.head(length_with_footer - self.footer_length)
deposit_data_name = 'Addgene Shared/Dev/Attribution Report/deposit_data.csv'
deposit_df = self._open_file_frame(deposit_data_name, date_cols=[7, 8])
# Clean up Salesforce
salesforce_df['Account Description'].fillna('', inplace=True)
salesforce_df.sort(SF_DATE, ascending=1)
salesforce_df["Full Name"] = salesforce_df["First Name"].map(unicode) + " " + salesforce_df["Last Name"]
del salesforce_df["First Name"]
del salesforce_df["Last Name"]
# Cleanup Deposit Data
deposit_df['Org Name'].fillna('', inplace=True)
deposit_df.sort(DP_DATE, ascending=1)
deposit_df['PI_Name'].astype(unicode)
return salesforce_df, deposit_df
def get_filtered(self, filtered_df, sf_row, pi_name, pi_org, kind):
if kind == "PI":
filter_column = self.PI_COLUMN
filter_value = pi_name
elif kind == "ORG":
filter_column = self.ORG_COLUMN
filter_value = pi_org
name_match = filtered_df[filtered_df[filter_column] == filter_value]
output = []
if not name_match.empty:
for _, row in name_match.iterrows():
data = {
"Addgene Assigned": sf_row['Assigned'],
"Plasmid ID": row['Plasmid ID'],
"Deposit ID": row['Deposit ID'],
"Institute": row['Org Name'],
"PI Name": row['PI_Name'],
"Date Received": row[DP_DATE],
"Original Date": sf_row[SF_DATE],
"Original ORG": pi_org,
"Original PI": pi_name,
}
output.append(data)
return output
def get_attribution_dataframes(self):
salesforce, dep = self.get_dataframes()
name_matches = []
org_matches = []
# Iterate through the Salesforce report as the master document
for index, sf_row in salesforce.iterrows():
# Get a start date and an end date for filtering.
start_date = sf_row[SF_DATE]
end_date = start_date + relativedelta(months=self.months)
start = dep[DP_DATE].searchsorted(start_date)[0]
end = dep[DP_DATE].searchsorted(end_date)[0]
# Filter the deposit data to grab only things within that timeframe.
filtered_df = dep.ix[start:end]
# Variables for short names, and not having to type index a lot.
pi_name = unicode(sf_row['Full Name'])
pi_org = sf_row['Account Name']
# Get matches by the PI's name
by_name = self.get_filtered(filtered_df,
sf_row,
pi_name,
pi_org,
kind="PI")
name_matches.extend(by_name)
# Get matches by the organization name
by_org = self.get_filtered(filtered_df,
sf_row,
pi_name,
pi_org,
kind="ORG")
org_matches.extend(by_org)
return pd.DataFrame(name_matches), pd.DataFrame(org_matches)
def run(self):
name_df, org_df = self.get_attribution_dataframes()
name_df.to_excelv("names.xls")
org_df.to_excel("orgs.xls")
if __name__ == '__main__':
    # Entry point: authorize (or reuse cached credentials) and run the report.
    report = AttributionReport(
        credentials_file="credentials.json", months=6, footer_length=6)
    report.run()

View File

@ -1,4 +1,4 @@
dropbox
pandas
xlrd
python-dateutil
XlsxWriter==0.7.3
pandas==0.16.2
python-dateutil==2.4.2
xlrd==0.9.4

25
utils.py Normal file
View File

@ -0,0 +1,25 @@
__author__ = 'tyrelsouza'
import os
import json
def get_dropbox_dir():
    """
    Windows and Mac: get the Dropbox dir for Business, or fall back to
    personal.

    Reads Dropbox's own info.json (location differs per platform) and
    returns the configured sync path.  Falls back to the user's home
    directory when neither account type is configured.
    Raises IOError/OSError if info.json does not exist (Dropbox not
    installed).
    """
    if os.name == "nt":
        # Windows keeps info.json under %APPDATA%\Dropbox.
        dropbox_file = os.path.join(os.getenv('APPDATA'), 'Dropbox', 'info.json')
    else:
        dropbox_file = os.path.expanduser("~/.dropbox/info.json")
    with open(dropbox_file) as dbf:
        # json.load reads straight from the file object — idiomatic and
        # avoids the intermediate string of json.loads(dbf.read()).
        dbconfig = json.load(dbf)
    # Prefer the Business account's folder over the personal one.
    if "business" in dbconfig:
        dropbox_dir = dbconfig['business']['path']
    elif "personal" in dbconfig:
        dropbox_dir = dbconfig['personal']['path']
    else:
        dropbox_dir = os.path.expanduser("~")
    return dropbox_dir