# github_hackernews_reddit_stars/get_new_hn.py
__author__ = 'tsouza'

import datetime
from pprint import pprint

import requests

HN_URL = "http://hn.algolia.com/api/v1/search_by_date"


def top_of_the_hour(dt):
    """
    Return the epoch time at the top of the hour containing `dt`.
    `dt` should be a UTC datetime, since it is compared against the
    UTC epoch below. The script is meant to run at :01 past the hour,
    so the just-closed hour is complete.
    """
    dt = datetime.datetime(dt.year, dt.month, dt.day, dt.hour, 0)
    epoch = datetime.datetime.utcfromtimestamp(0)
    delta = dt - epoch
    return int(delta.total_seconds())
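
# For example (value computed by hand, assuming a UTC input):
# top_of_the_hour(datetime.datetime(2015, 1, 17, 1, 2, 59)) truncates to
# 2015-01-17 01:00:00 UTC, i.e. 1421456400 seconds since the epoch.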


def get_page(start_time, end_time, page=0):
    """
    Fetch one page of Hacker News stories created between start_time
    and end_time (epoch seconds), via the Algolia search API.
    """
    params = {
        'tags': 'story',
        'numericFilters': 'created_at_i>{0},created_at_i<{1}'.format(start_time, end_time),
        'page': page,
        'hitsPerPage': 50
    }
    body = requests.get(HN_URL, params=params).json()
    return body
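
# A minimal sketch of the response shape this script relies on; the real
# Algolia payload carries more keys, and the values here are illustrative:
#
# {
#     "nbPages": 3,                 # total number of result pages
#     "hits": [
#         {
#             "author": "someuser",
#             "created_at_i": 1421452800,
#             "title": "Example story title",
#             "url": "https://github.com/example/repo",
#         },
#     ],
# }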


def get_githubs(start_time, end_time):
    """
    Get all the GitHub links posted in the time range.
    """
    # Get the first page.
    page = get_page(start_time, end_time)
    hits = page['hits']
    # If there is more than one page, get the rest of them.
    if page['nbPages'] > 1:
        for page_number in range(1, page['nbPages']):
            page = get_page(start_time, end_time, page_number)
            hits.extend(page['hits'])
    # Strip out non-GitHub links. Self posts (e.g. Ask HN) have no URL,
    # so guard with .get() before testing the string.
    githubs = [hit for hit in hits
               if hit.get('url') and "github.com" in hit['url']]
    return githubs


def strip_fields(github):
    """
    Get rid of the unneeded fields.
    """
    return dict(
        source="hackernews",
        author=github['author'],
        created_at=github['created_at_i'],
        url=github['url'],
        title=github['title']
    )
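
# Each hit is reduced to a flat record, e.g. (illustrative values):
# {'source': 'hackernews', 'author': 'someuser',
#  'created_at': 1421452800, 'url': 'https://github.com/example/repo',
#  'title': 'Example story title'}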


def hacker_news(hours=1):
    """
    Return stripped-down GitHub stories from the last `hours` full hours.
    """
    # Use UTC so the epoch arithmetic in top_of_the_hour is correct.
    end_time = top_of_the_hour(datetime.datetime.utcnow())
    start_time = end_time - ((60 * 60) * hours)
    return [strip_fields(github) for github in get_githubs(start_time, end_time)]
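

# A minimal usage sketch: fetch the last hour's GitHub stories and
# pretty-print them. This guard is an illustrative addition, not part
# of the original script.
if __name__ == '__main__':
    pprint(hacker_news(hours=1))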