initial commit

This commit is contained in:
Tyrel Souza 2015-01-17 01:02:59 -05:00
commit 276880d6fd
4 changed files with 200 additions and 0 deletions

110
.gitignore vendored Normal file
View File

@ -0,0 +1,110 @@
# Created by https://www.gitignore.io
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml
# Translations
*.mo
*.pot
# Django stuff:
*.log
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Created by https://www.gitignore.io
### PyCharm ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm
*.iml
## Directory-based project format:
.idea/
# if you remove the above rule, at least ignore the following:
# User-specific stuff:
# .idea/workspace.xml
# .idea/tasks.xml
# .idea/dictionaries
# Sensitive or high-churn files:
# .idea/dataSources.ids
# .idea/dataSources.xml
# .idea/sqlDataSources.xml
# .idea/dynamic.xml
# .idea/uiDesigner.xml
# Gradle:
# .idea/gradle.xml
# .idea/libraries
# Mongo Explorer plugin:
# .idea/mongoSettings.xml
## File-based project format:
*.ipr
*.iws
## Plugin-specific files:
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
*.sqlite3

20
gather_latest.py Normal file
View File

@ -0,0 +1,20 @@
from get_new_hn import hacker_news
from pymongo import MongoClient
import datetime
def get_new_hackernews():
    """
    Store any Hacker News GitHub stories that are not yet in MongoDB.

    Fetches the latest stories via hacker_news(), inserts each one whose
    'url' is not already present in the `githubs.hacker_news` collection,
    and prints a timestamped count of the newly inserted documents.
    """
    client = MongoClient()
    try:
        hn_db = client.githubs.hacker_news
        inserted = 0
        for hn in hacker_news():
            # The URL is the de-duplication key: skip stories already stored.
            if hn_db.find_one({'url': hn['url']}):
                continue
            # insert_one() replaces the deprecated Collection.insert().
            hn_db.insert_one(hn)
            inserted += 1
        # A single pre-formatted string prints identically on Python 2 and 3.
        print("{0} Inserted {1}".format(datetime.datetime.now(), inserted))
    finally:
        # Release the connection even if fetching or inserting fails.
        client.close()
# Script entry point: run one collection pass when executed directly.
if __name__ == '__main__':
    get_new_hackernews()

68
get_new_hn.py Normal file
View File

@ -0,0 +1,68 @@
__author__ = 'tsouza'
import datetime
import requests
from pprint import pprint
# Hacker News search endpoint on Algolia (`search_by_date`) queried by get_page().
HN_URL = "http://hn.algolia.com/api/v1/search_by_date"
def top_of_the_hour(dt):
    """
    Return the epoch seconds for the start of the hour containing `dt`.

    Minutes, seconds and microseconds are discarded, and the remaining
    wall-clock fields are measured against 1970-01-01 00:00:00, so `dt` is
    interpreted as UTC. Truncating to the hour means a job that runs at :01
    past the hour still gets a stable boundary.
    """
    hour_start = datetime.datetime(dt.year, dt.month, dt.day, dt.hour)
    elapsed = hour_start - datetime.datetime(1970, 1, 1)
    return int(elapsed.total_seconds())
def get_page(start_time, end_time, page=0, timeout=30):
    """
    Fetch one page of Hacker News stories created in a time window.

    :param start_time: Window start as epoch seconds (inclusive).
    :param end_time: Window end as epoch seconds (exclusive).
    :param page: Zero-based index of the result page to request.
    :param timeout: Seconds to wait for the API before giving up.
    :returns: The decoded JSON response body.
    :raises requests.exceptions.RequestException: On timeout, connection
        failure, or a non-2xx HTTP status.
    """
    params = {
        'tags': 'story',
        # Half-open window [start_time, end_time): consecutive hourly
        # windows tile exactly, so a story created precisely on the hour
        # boundary is neither skipped nor fetched twice.
        'numericFilters': 'created_at_i>={0},created_at_i<{1}'.format(
            start_time, end_time),
        'page': page,
        'hitsPerPage': 50,
    }
    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.get(HN_URL, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of returning an error payload.
    response.raise_for_status()
    return response.json()
def get_githubs(end_time, start_time):
    """
    Collect every story in the time range whose URL points at GitHub.

    Note the argument order: `end_time` comes first.

    :param end_time: Window end as epoch seconds.
    :param start_time: Window start as epoch seconds.
    :returns: List of hit dicts whose 'url' contains "github.com".
    """
    first_page = get_page(start_time, end_time)
    hits = list(first_page['hits'])
    # Page 0 is already in hand; the range is empty when there is at most
    # one page, so no separate "more than one page" check is needed.
    for page_number in range(1, first_page['nbPages']):
        hits.extend(get_page(start_time, end_time, page_number)['hits'])
    # A hit may have no URL (missing key or None, presumably text-only
    # posts); `in` on None would raise TypeError, so such hits are treated
    # as non-GitHub and dropped.
    return [hit for hit in hits if "github.com" in (hit.get('url') or "")]
def strip_fields(github):
    """
    Reduce a raw Hacker News hit to the fields that get stored.

    The result always has source "hackernews" plus the hit's author, URL
    and title, and its creation time copied from 'created_at_i' to
    'created_at'. A missing field raises KeyError.
    """
    # (stored key, key in the raw hit), in the order the record is built.
    field_map = (
        ("author", "author"),
        ("created_at", "created_at_i"),
        ("url", "url"),
        ("title", "title"),
    )
    record = {"source": "hackernews"}
    for stored_key, hit_key in field_map:
        record[stored_key] = github[hit_key]
    return record
def hacker_news(hours=1):
    """
    Return the stripped-down GitHub stories from the last `hours` hours.

    The window ends at the top of the current UTC hour and starts `hours`
    hours before that.

    :param hours: Length of the look-back window in hours.
    :returns: List of dicts as produced by strip_fields().
    """
    # top_of_the_hour() measures its argument against the UTC epoch, so it
    # must be given UTC wall-clock time; local time would shift the window
    # by the machine's UTC offset.
    end_time = top_of_the_hour(datetime.datetime.utcnow())
    start_time = end_time - 60 * 60 * hours
    return [strip_fields(github) for github in get_githubs(end_time, start_time)]

2
requirements.txt Normal file
View File

@ -0,0 +1,2 @@
requests
pymongo