initial commit

2015-01-17 01:02:59 -05:00 · 2015-01-17 01:02:59 -05:00 · 276880d6fd
commit 276880d6fd
4 changed files with 200 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,110 @@
+# Created by https://www.gitignore.io
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other info into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.cache
+nosetests.xml
+coverage.xml
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Created by https://www.gitignore.io
+
+### PyCharm ###
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm
+
+*.iml
+
+## Directory-based project format:
+.idea/
+# if you remove the above rule, at least ignore the following:
+
+# User-specific stuff:
+# .idea/workspace.xml
+# .idea/tasks.xml
+# .idea/dictionaries
+
+# Sensitive or high-churn files:
+# .idea/dataSources.ids
+# .idea/dataSources.xml
+# .idea/sqlDataSources.xml
+# .idea/dynamic.xml
+# .idea/uiDesigner.xml
+
+# Gradle:
+# .idea/gradle.xml
+# .idea/libraries
+
+# Mongo Explorer plugin:
+# .idea/mongoSettings.xml
+
+## File-based project format:
+*.ipr
+*.iws
+
+## Plugin-specific files:
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+
+
+*.sqlite3
--- a/gather_latest.py
+++ b/gather_latest.py
@ -0,0 +1,20 @@
+from get_new_hn import hacker_news
+from pymongo import MongoClient
+import datetime
+
+
+def get_new_hackernews():
+    """
+        Store the latest hackernews github stories in MongoDB.
+
+        Fetches stories via hacker_news() and inserts every one whose 'url'
+        is not already present in the githubs.hacker_news collection, then
+        prints a timestamped count of the inserted documents.
+    """
+    # Fetch before connecting so a failed HTTP call never opens a connection.
+    hn_data = hacker_news()
+    client = MongoClient()
+    try:
+        hn_db = client.githubs.hacker_news
+        inserted = 0
+        for hn in hn_data:
+            # The url is the de-duplication key; skip stories already stored.
+            if not hn_db.find_one({'url': hn['url']}):
+                hn_db.insert(hn)
+                inserted += 1
+    finally:
+        # Always release the connection, even if a query fails part-way.
+        client.close()
+    # Single-argument print call: same output on Python 2 and Python 3.
+    print("{0} Inserted {1}".format(datetime.datetime.now(), inserted))
+
+
+# Script entry point: run one collection pass when executed directly.
+if __name__ == '__main__':
+    get_new_hackernews()
--- a/get_new_hn.py
+++ b/get_new_hn.py
@ -0,0 +1,68 @@
+__author__ = 'tsouza'
+import datetime
+import requests
+from pprint import pprint
+
+# search_by_date endpoint of the Algolia-hosted Hacker News search API (v1);
+# get_page() sends every query to this URL.
+HN_URL = "http://hn.algolia.com/api/v1/search_by_date"
+
+
+def top_of_the_hour(dt):
+    """
+        Return the epoch seconds for the start of the hour containing dt.
+
+        Minutes, seconds and microseconds are dropped, so a run at :01 past
+        the hour still yields the hour boundary itself. Any tzinfo on dt is
+        ignored: the wall-clock fields are measured against 1970-01-01 00:00
+        as if they were UTC.
+    """
+    hour_start = dt.replace(minute=0, second=0, microsecond=0, tzinfo=None)
+    since_epoch = hour_start - datetime.datetime(1970, 1, 1)
+    return int(since_epoch.total_seconds())
+
+
+def get_page(start_time, end_time, page=0, timeout=30):
+    """
+        Get one page of hackernews stories created in a time window.
+
+        start_time and end_time are epoch seconds; the window is half-open,
+        [start_time, end_time), so back-to-back windows neither skip nor
+        repeat a story created exactly on a boundary.
+        page is the zero-based result page to fetch.
+        timeout is how many seconds to wait on the server before giving up
+        (without it requests waits indefinitely).
+
+        Returns the decoded JSON body of the response.
+        Raises requests.exceptions.HTTPError on a 4xx/5xx response.
+    """
+    params = {
+        'tags': 'story',
+        # >= on the lower bound: with a strict > on both ends a story stamped
+        # exactly on the hour was excluded from both adjacent windows.
+        'numericFilters': 'created_at_i>={0},created_at_i<{1}'.format(start_time, end_time),
+        'page': page,
+        'hitsPerPage': 50
+    }
+    response = requests.get(HN_URL, params=params, timeout=timeout)
+    # Fail loudly on an error status instead of handing back an error payload.
+    response.raise_for_status()
+    return response.json()
+
+
+def get_githubs(end_time, start_time):
+    """
+        Get all the github links for the time range.
+
+        Note the argument order: end_time comes first, then start_time.
+        Both are handed to get_page() as the bounds of the query window.
+        Returns the list of raw hit dicts whose 'url' contains "github.com".
+    """
+    # get the first page; it also reports how many pages there are in total
+    page = get_page(start_time, end_time)
+    hits = page['hits']
+    # get the rest of the pages, if any (the range is empty for <= 1 page)
+    for page_number in range(1, page['nbPages']):
+        hits.extend(get_page(start_time, end_time, page_number)['hits'])
+
+    # Strip out all non github links. A hit's url can be null or absent,
+    # which used to raise TypeError on the `in` test; treat it as no link.
+    return [hit for hit in hits if "github.com" in (hit.get('url') or "")]
+
+
+def strip_fields(github):
+    """
+        Reduce a raw hit to the handful of fields that get stored.
+
+        The result always has source="hackernews" plus author, created_at
+        (taken from 'created_at_i'), url and title copied from the hit.
+        Raises KeyError if the hit lacks any of those keys.
+    """
+    # (stored field name, key in the raw hit), in the order they are read.
+    wanted = (
+        ("author", "author"),
+        ("created_at", "created_at_i"),
+        ("url", "url"),
+        ("title", "title"),
+    )
+    slim = {"source": "hackernews"}
+    for stored_name, hit_key in wanted:
+        slim[stored_name] = github[hit_key]
+    return slim
+
+
+def hacker_news(hours=1):
+    """
+        Get the github stories posted to hackernews in the last whole hours.
+
+        The window ends at the top of the current hour and reaches back
+        `hours` hours. Returns a list of dicts as built by strip_fields().
+    """
+    # top_of_the_hour() measures against the UTC epoch, so feed it UTC
+    # wall-clock time; local time would shift the whole window by the
+    # machine's UTC offset.
+    end_time = top_of_the_hour(datetime.datetime.utcnow())
+    start_time = end_time - ((60 * 60) * hours)
+    return [strip_fields(github) for github in get_githubs(end_time, start_time)]
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,2 @@
+requests
+pymongo