stuff
This commit is contained in:
38
15-reddit-scraper/README.md
Normal file
38
15-reddit-scraper/README.md
Normal file
@ -0,0 +1,38 @@
|
||||
Reddit Miner
|
||||
============
|
||||
|
||||
Now that you've finished mining CoinMarketCap for data, you've seen that scraping data out of HTML can be kind of messy. If they decide to change the id of a div or something, we're screwed. It's extremely useful, but not an exact science by any stretch.
|
||||
|
||||
Now we're going to move on to Reddit. Reddit is a beast. It's huge and constantly changing. Thank doge that they will serve us JSON.
|
||||
|
||||
What does that mean? Well, go to http://www.reddit.com/.json and take a look.
|
||||
|
||||
The front page is one massive JSON object. In fact, every page on Reddit is.
|
||||
|
||||
Let's parse that beast.
|
||||
|
||||
#### Getting the JSON
|
||||
|
||||
Using the `requests` library, fetch that big JSON document and load it into a Python object.
|
||||
|
||||
#### Our database
|
||||
|
||||
We're going to build an app that saves every front page post into the database when it is run. For each post, we'll want its title, author, url, subreddit it was posted in, number of upvotes, and the datetime it was posted on Reddit.
|
||||
|
||||
Create the necessary table using the Django ORM.
|
||||
|
||||
#### Getting the correct data
|
||||
|
||||
That JSON response is a little scary. It's huge and complex. How can you explore it in the terminal to figure out which fields you need?
|
||||
|
||||
Access the correct fields in the object and save them to your db. We don't want duplicate entries - ensure uniqueness in a way that makes sense.
|
||||
|
||||
If you run your program again and an entry already exists, update it so we get the most recent score.
|
||||
|
||||
#### Indexing
|
||||
|
||||
Whatever column you decide to use to ensure uniqueness, we'll want to add an [index](http://en.wikipedia.org/wiki/Database_index) to this field because we're going to be searching it a lot. What is the Big O complexity of searching with an index? What is it without one?
|
||||
|
||||
#### The final product
|
||||
|
||||
Create a function that pulls a random row from your Reddit front page database and prints it to the screen. Once you have the scraper running on a schedule with crontab, you should soon have a huge database of the top Reddit posts and links. In 6 months, repost and reap those sweet sweet internet points. Oh yeah!
|
10
15-reddit-scraper/reddit_project/manage.py
Normal file
10
15-reddit-scraper/reddit_project/manage.py
Normal file
@ -0,0 +1,10 @@
|
||||
#!/usr/bin/env python
"""Command-line entry point for the reddit_project Django project."""
import os
import sys


def main():
    """Select the project settings module and run the requested command.

    The command and its arguments are taken from ``sys.argv``.
    """
    # setdefault leaves an already-exported DJANGO_SETTINGS_MODULE untouched.
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "reddit_project.settings")

    # Imported only after the environment variable is in place, keeping the
    # statement order of the original script.
    from django.core.management import execute_from_command_line

    execute_from_command_line(sys.argv)


if __name__ == "__main__":
    main()
|
83
15-reddit-scraper/reddit_project/reddit_project/settings.py
Normal file
83
15-reddit-scraper/reddit_project/reddit_project/settings.py
Normal file
@ -0,0 +1,83 @@
"""
Django settings for reddit_project project.

For more information on this file, see
https://docs.djangoproject.com/en/1.7/topics/settings/

For the full list of settings and their values, see
https://docs.djangoproject.com/en/1.7/ref/settings/

The secret key may be overridden with the ``DJANGO_SECRET_KEY`` environment
variable; when the variable is unset, the development key below is used.
"""

# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
import os

# abspath() keeps BASE_DIR absolute even when this module is loaded through
# a relative __file__.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/1.7/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
# The literal is only a development fallback; set DJANGO_SECRET_KEY to
# supply a different key without editing this file.
SECRET_KEY = os.environ.get(
    'DJANGO_SECRET_KEY',
    '-820u&ao04y97d)%!s@%k01*4+-f&!+i9$&k!6%ddt7osm72gk',
)

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True

TEMPLATE_DEBUG = True

ALLOWED_HOSTS = []


# Application definition

INSTALLED_APPS = (
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
)

MIDDLEWARE_CLASSES = (
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.auth.middleware.SessionAuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
)

ROOT_URLCONF = 'reddit_project.urls'

WSGI_APPLICATION = 'reddit_project.wsgi.application'


# Database
# https://docs.djangoproject.com/en/1.7/ref/settings/#databases

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        # The SQLite file lives directly under BASE_DIR.
        'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
    }
}

# Internationalization
# https://docs.djangoproject.com/en/1.7/topics/i18n/

LANGUAGE_CODE = 'en-us'

TIME_ZONE = 'UTC'

USE_I18N = True

USE_L10N = True

USE_TZ = True


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/1.7/howto/static-files/

STATIC_URL = '/static/'
|
10
15-reddit-scraper/reddit_project/reddit_project/urls.py
Normal file
10
15-reddit-scraper/reddit_project/reddit_project/urls.py
Normal file
@ -0,0 +1,10 @@
|
||||
"""URL configuration for reddit_project; only the Django admin is routed."""
from django.conf.urls import patterns, include, url
from django.contrib import admin

# The first argument to patterns() is left as an empty string here.
urlpatterns = patterns(
    '',
    # Examples:
    # url(r'^$', 'reddit_project.views.home', name='home'),
    # url(r'^blog/', include('blog.urls')),

    url(r'^admin/', include(admin.site.urls)),
)
|
14
15-reddit-scraper/reddit_project/reddit_project/wsgi.py
Normal file
14
15-reddit-scraper/reddit_project/reddit_project/wsgi.py
Normal file
@ -0,0 +1,14 @@
"""
WSGI config for reddit_project project.

It exposes the WSGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/1.7/howto/deployment/wsgi/
"""

import os

# setdefault only fills in the settings module when the environment has not
# already chosen one.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "reddit_project.settings")

# Kept after the setdefault call, matching the original statement order.
from django.core.wsgi import get_wsgi_application

application = get_wsgi_application()
|
3
15-reddit-scraper/reddit_project/scraper/admin.py
Normal file
3
15-reddit-scraper/reddit_project/scraper/admin.py
Normal file
@ -0,0 +1,3 @@
|
||||
from django.contrib import admin
|
||||
|
||||
# Register your models here.
|
3
15-reddit-scraper/reddit_project/scraper/models.py
Normal file
3
15-reddit-scraper/reddit_project/scraper/models.py
Normal file
@ -0,0 +1,3 @@
|
||||
from django.db import models
|
||||
|
||||
# Create your models here.
|
3
15-reddit-scraper/reddit_project/scraper/tests.py
Normal file
3
15-reddit-scraper/reddit_project/scraper/tests.py
Normal file
@ -0,0 +1,3 @@
|
||||
from django.test import TestCase
|
||||
|
||||
# Create your tests here.
|
3
15-reddit-scraper/reddit_project/scraper/views.py
Normal file
3
15-reddit-scraper/reddit_project/scraper/views.py
Normal file
@ -0,0 +1,3 @@
|
||||
from django.shortcuts import render
|
||||
|
||||
# Create your views here.
|
Reference in New Issue
Block a user