Scrapey takes a snapshot of Reddit /r/all (hot) about every 12 minutes and saves the data, including a calculated age for each post, to a .csv file. Each iteration takes about 2 minutes to run and adds roughly 100 unique posts to the list while updating any posts it has already seen.
To run it yourself, create a file ./sekrit containing your client_id, client_secret, username, and password.
Each value goes on its own line in that order, or you can hard-code them below.
If you don't want to use a username or password, just comment out those lines below.
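For reference, ./sekrit is just a plain text file with one value per line, in the order the code below reads them; these are placeholders, not real credentials:
YOUR_CLIENT_ID
YOUR_CLIENT_SECRET
your_reddit_username
your_reddit_password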
Imports
import praw
import pandas as pd
from datetime import datetime
import time
print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
Load all from current collection
# Connect to DB
db_name = 'data/startingover.csv'
# db = pd.DataFrame() # for fresh start
db = pd.read_csv(db_name)
print('Connected to DB...')
print(db.shape)
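On the very first run data/startingover.csv won't exist yet, so pd.read_csv will raise a FileNotFoundError. A minimal alternative, if you'd rather not toggle the fresh-start line by hand (a sketch, not part of the original script):
# Sketch: fall back to an empty DataFrame on the very first run
try:
    db = pd.read_csv(db_name)
except FileNotFoundError:
    db = pd.DataFrame()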
Access Reddit API via PRAW
# Extremely Confidential
sekrits = open('sekrit').read().splitlines()  # one credential per line, tolerant of a trailing newline
# Connect to Reddit
reddit = praw.Reddit(
    client_id     = sekrits[0],
    client_secret = sekrits[1],
    username      = sekrits[2],  # Optional
    password      = sekrits[3],  # Optional
    redirect_uri  = 'http://localhost:8080',
    user_agent    = 'totally_not_a_bot',  # fool everyone
)
print('Connected to Reddit...')
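A quick sanity check (not in the original script): PRAW exposes a read_only flag that shows whether the username/password were actually picked up or the client fell back to read-only mode.
# Sketch: confirm whether PRAW authenticated as a user or is read-only
print('Read-only mode:', reddit.read_only)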
The following block is a little large, but splitting it up would break the loop and it couldn't be run from the notebook.
# Grab everything from /r/all hot
print('Pulling...')
while True:
    pull = pd.DataFrame({
        'author': post.author,
        # 'comments': post.comments, # takes really long, returns object
        'created_utc': post.created_utc,
        'distinguished': post.distinguished,
        'edited': post.edited,
        'id': post.id,
        'is_original_content': post.is_original_content,
        'is_self': post.is_self,
        'link_flair_text': post.link_flair_text,
        'locked': post.locked,
        'name': post.name,
        'num_comments': post.num_comments,
        'over_18': post.over_18,
        'permalink': post.permalink,
        'score': post.score,
        'selftext': post.selftext,
        'spoiler': post.spoiler,
        'stickied': post.stickied,
        'subreddit': post.subreddit,
        'title': post.title,
        'upvote_ratio': post.upvote_ratio,
        'url': post.url,
        'utc_now': datetime.utcnow().timestamp(),
        'post_age': datetime.utcnow().timestamp() - post.created_utc  # Create age col
    } for post in reddit.subreddit('all').hot(limit=None))
    # add the new pull to the BOTTOM of the old list
    db = pd.concat([db, pull])
    # keep only the latest copy of each post, effectively updating records in place
    db = db.drop_duplicates('id', keep='last')
    # save
    db.to_csv(db_name, index=False)
    # stats
    haul = pull.shape[0]
    total = db.shape[0]
    print('Haul: ', haul)
    print('Total:', total)
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    # wait
    print('Now wait...')
    time.sleep(600)
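The concat-then-drop_duplicates pattern above is what makes repeat sightings act as updates: a post's newest row lands below its older one, and keep='last' keeps only the freshest copy. A tiny standalone illustration with made-up ids and scores:
# Toy demonstration of the update-in-place trick (fake data, not real posts)
import pandas as pd

old = pd.DataFrame({'id': ['a1', 'b2'], 'score': [10, 50]})
new = pd.DataFrame({'id': ['b2', 'c3'], 'score': [75, 5]})  # b2 seen again with a new score
merged = pd.concat([old, new]).drop_duplicates('id', keep='last')
print(merged)  # a1 keeps 10, b2 updates to 75, c3 is added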
I run this in the background in a terminal and it updates my data set every ~12 minutes. That means each post's record is at most about 12 minutes old at the moment it disappears from /r/all.
Next up: EDA
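As a rough preview of that next step (a sketch, not part of Scrapey itself), the saved CSV loads straight back into pandas, and post_age, stored in seconds, converts easily to hours:
# Sketch: first look at the collected data; column names match what Scrapey saves
import pandas as pd

df = pd.read_csv('data/startingover.csv')
df['age_hours'] = df['post_age'] / 3600  # post_age was saved in seconds
print(df.shape)
print(df[['score', 'num_comments', 'age_hours']].describe())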