From 276880d6fdb1c592e1b8fcd0e9006bab30be1daf Mon Sep 17 00:00:00 2001 From: Tyrel Souza Date: Sat, 17 Jan 2015 01:02:59 -0500 Subject: [PATCH] initial commit --- .gitignore | 110 +++++++++++++++++++++++++++++++++++++++++++++++ gather_latest.py | 20 +++++++++ get_new_hn.py | 68 +++++++++++++++++++++++++++++ requirements.txt | 2 + 4 files changed, 200 insertions(+) create mode 100644 .gitignore create mode 100644 gather_latest.py create mode 100644 get_new_hn.py create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a2fbedc --- /dev/null +++ b/.gitignore @@ -0,0 +1,110 @@ +# Created by https://www.gitignore.io + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.cache +nosetests.xml +coverage.xml + +# Translations +*.mo +*.pot + +# Django stuff: +*.log + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Created by https://www.gitignore.io + +### PyCharm ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm + +*.iml + +## Directory-based project format: +.idea/ +# if you remove the above rule, at least ignore the following: + +# User-specific stuff: +# .idea/workspace.xml +# .idea/tasks.xml +# .idea/dictionaries + +# Sensitive or high-churn files: +# .idea/dataSources.ids +# .idea/dataSources.xml +# .idea/sqlDataSources.xml +# .idea/dynamic.xml +# .idea/uiDesigner.xml + +# Gradle: +# .idea/gradle.xml +# .idea/libraries + +# Mongo Explorer plugin: +# .idea/mongoSettings.xml + +## File-based project format: +*.ipr +*.iws + +## Plugin-specific files: + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties + + +*.sqlite3 diff --git a/gather_latest.py b/gather_latest.py new file mode 100644 index 0000000..482572d --- /dev/null +++ b/gather_latest.py @@ -0,0 +1,20 @@ +from get_new_hn import hacker_news +from pymongo import MongoClient +import datetime + + +def get_new_hackernews(): + client = MongoClient() + hn_data = hacker_news() + hn_db = client.githubs.hacker_news + + inserted = 0 + for hn in hn_data: + if not hn_db.find_one({'url': hn['url']}): + hn_db.insert(hn) + inserted += 1 + print datetime.datetime.now(), "Inserted", inserted + + +if __name__ == '__main__': + get_new_hackernews() \ No newline at end of file diff --git a/get_new_hn.py b/get_new_hn.py new file mode 100644 index 0000000..7917cbd --- /dev/null +++ b/get_new_hn.py @@ -0,0 
"""
Find recent Hacker News stories that link to GitHub.

Stories are fetched from the Algolia Hacker News search API for a window of
whole hours that ends at the top of the current UTC hour.
"""
__author__ = 'tsouza'
import datetime
import time
from pprint import pprint

HN_URL = "http://hn.algolia.com/api/v1/search_by_date"

# Seconds to wait for the API, so a stalled connection cannot hang an
# unattended run forever.
REQUEST_TIMEOUT = 30

# Reference instant for epoch seconds, as a naive datetime expressed in UTC.
_EPOCH = datetime.datetime(1970, 1, 1)


def top_of_the_hour(dt):
    """
    Get the epoch time, in whole seconds, at the top of the hour of dt.

    A naive dt is taken to be in UTC already, because it is measured against
    the UTC epoch. A timezone-aware dt is converted to UTC first instead of
    having its offset silently dropped.

    Truncating to the hour means a job that starts a little after the hour
    (for example at :01) still gets the exact hour boundary.
    """
    offset = dt.utcoffset()
    if offset is not None:
        dt = dt.replace(tzinfo=None) - offset
    hour_start = datetime.datetime(dt.year, dt.month, dt.day, dt.hour, 0)
    return int((hour_start - _EPOCH).total_seconds())


def get_page(start_time, end_time, page=0):
    """
    Get one page of stories created from start_time up to end_time.

    Both times are epoch seconds. start_time is inclusive and end_time is
    exclusive, so consecutive hourly windows neither overlap nor leave out
    a story created exactly on the boundary.

    Returns the decoded JSON body of the response. Raises
    requests.HTTPError when the API answers with an error status.
    """
    # Imported here so the pure helpers in this module stay importable
    # without the HTTP dependency installed.
    import requests

    params = {
        'tags': 'story',
        'numericFilters': 'created_at_i>={0},created_at_i<{1}'.format(start_time, end_time),
        'page': page,
        'hitsPerPage': 50
    }
    response = requests.get(HN_URL, params=params, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an error response rather than on a missing key later.
    response.raise_for_status()
    return response.json()


def is_github_hit(hit):
    """
    Tell whether a search hit links to GitHub.

    Hits without a link (the API reports a null url for text-only posts)
    are never GitHub links.
    """
    url = hit.get('url')
    return bool(url) and "github.com" in url


def get_githubs(end_time, start_time):
    """
    Get all the github links for the time range.

    Note the argument order: end_time comes first, then start_time.
    Returns the raw hits, from every result page, whose url mentions
    github.com.
    """
    # get the first page, which also says how many pages there are
    first_page = get_page(start_time, end_time)
    hits = list(first_page['hits'])
    # if more than one page, get the rest of them
    for page_number in range(1, first_page['nbPages']):
        hits.extend(get_page(start_time, end_time, page_number)['hits'])

    # Strip out all non github links.
    return [hit for hit in hits if is_github_hit(hit)]


def strip_fields(github):
    """
    Get rid of the unneeded fields.

    Returns a new dict with the author, creation time, url and title of the
    hit, tagged with the source it came from.
    """
    return dict(
        source="hackernews",
        author=github['author'],
        created_at=github['created_at_i'],
        url=github['url'],
        title=github['title']
    )


def hacker_news(hours=1):
    """
    Get the GitHub stories posted in the last `hours` whole hours.

    The window ends at the top of the current hour in UTC, the same clock
    the API's created_at_i timestamps use.
    """
    # Build the current UTC time from the epoch clock so the result does not
    # depend on the machine's local timezone.
    now_utc = _EPOCH + datetime.timedelta(seconds=int(time.time()))
    end_time = top_of_the_hour(now_utc)
    start_time = end_time - ((60 * 60) * hours)
    return [strip_fields(github) for github in get_githubs(end_time, start_time)]