In [1]:
from bs4 import BeautifulSoup
import requests
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
from pprint import pprint

Scraping Categories

In [12]:
key = "redacted"
cat_ids = []
cat_names = []
url = 'https://www.googleapis.com/youtube/v3/videoCategories?part=snippet&hl=en&regionCode=US&key={}'.format(key)
r = requests.get(url).json()
for rid in r['items']:
    cat_ids.append(rid['id'])
    cat_names.append(rid['snippet']['title'])
    
cat_dict = dict(zip(cat_ids,cat_names))
#pprint(cat_dict)

Scraping Channels from Categories

In [13]:
max_npt = 200
cat_id = 22
ids = []
cats = []

for cat_id in cat_ids:
    url = 'https://www.googleapis.com/youtube/v3/search?part=snippet&type=video&videoCategoryId={}&maxResults=50&key={}'.format(cat_id,key)
    r = requests.get(url).json()
    
    chans = []
    
    for vid in r['items']:
        if vid['kind'] == 'youtube#video':
            ids.append(vid['snippet']['channelId'])
    count = 0
    while ('nextPageToken' in r) and (count < max_npt):
        npt = r['nextPageToken']
        url = 'https://www.googleapis.com/youtube/v3/search?part=snippet&type=video&videoCategoryId={}&maxResults=50&key={}&pageToken={}'.format(cat_id,key,npt)
        r = requests.get(url).json()
        count += 1
        for vid in r['items']:
            if vid['id']['kind'] == 'youtube#video':
                chans.append(vid['snippet']['channelId'])
    ids.extend(set(chans))
    cats.extend([cat_id] * len(set(chans)))
                
    print(cat_dict[cat_id]+ ': ' + str(len(set(chans))) + ' channels, '+str(len(chans)) + ' videos')
Film & Animation: 151 channels, 392 videos
Autos & Vehicles: 153 channels, 523 videos
Music: 212 channels, 304 videos
Pets & Animals: 104 channels, 487 videos
Sports: 146 channels, 434 videos
Short Movies: 0 channels, 0 videos
Travel & Events: 191 channels, 537 videos
Gaming: 214 channels, 483 videos
Videoblogging: 0 channels, 0 videos
People & Blogs: 293 channels, 507 videos
Comedy: 106 channels, 409 videos
Entertainment: 210 channels, 387 videos
News & Politics: 127 channels, 506 videos
Howto & Style: 207 channels, 481 videos
Education: 218 channels, 533 videos
Science & Technology: 241 channels, 519 videos
Nonprofits & Activism: 154 channels, 550 videos
Movies: 40 channels, 101 videos
Anime/Animation: 0 channels, 0 videos
Action/Adventure: 0 channels, 0 videos
Classics: 0 channels, 0 videos
Comedy: 5 channels, 8 videos
Documentary: 0 channels, 0 videos
Drama: 0 channels, 0 videos
Family: 0 channels, 0 videos
Foreign: 0 channels, 0 videos
Horror: 0 channels, 0 videos
Sci-Fi/Fantasy: 1 channels, 302 videos
Thriller: 0 channels, 0 videos
Shorts: 0 channels, 0 videos
Shows: 94 channels, 477 videos
Trailers: 156 channels, 491 videos

Instantiate Database

In [19]:
import YT_compile 
import sqlite3 as db

import importlib
importlib.reload(YT_compile)
import YT_compile 

cid = "UCs9cE8_vVtbqF1l5nuhYirw"
df = YT_compile.YT_vid_compile(user_id=cid)
df['channel'] = cid
conn = db.connect('YT.db')
try: 
    df.to_sql('videos',conn,if_exists='fail')
except:
    print("DB already exists")
Out[19]:
commentCount dislikeCount favoriteCount likeCount viewCount id publishedAt categoryId channel
0 0 1 0 2 201 WhWRadQK5MQ 2018-03-07 09:11:32 24 UCs9cE8_vVtbqF1l5nuhYirw
1 1568 138 0 14283 869125 0dEP_TebXaE 2012-01-23 03:55:35 1 UCs9cE8_vVtbqF1l5nuhYirw

Scraping Content from Channels

In [20]:
df_list = []
for idx,cid in enumerate(ids): 
    
    ## Just an example, don't actually want to process
    if idx > 8:
        break
        
    ## Check DB for user
    conn = db.connect('YT.db')
    c = conn.cursor()
    
    # Get the already processed channels from VIDEOS
    try:
        c.execute("SELECT DISTINCT channel FROM videos") 
    # If VIDEOS doesn't exist, make a new table VIDEOS
    except:
        print('Building new DB...')
        df_list = []
        print(cid)
        cid = ids[0]
        df_temp = YT_compile.YT_vid_compile(user_id=cid)
        
        # Add features not included with YT_compile
        df_temp['channel'] = cid

        # Add to list
        df_list.append(df_temp)

        # Build YT.db if it doesn't already exist
        conn = db.connect('YT.db')
        
        df_list[0].to_sql('videos',conn,if_exists='fail')
        c.execute("SELECT DISTINCT channel FROM videos") 
        
        
    # build a set with channel names
    names = {name[0] for name in c.fetchall()} 
    if cid in names:  
        sys.stdout.write("\r" + str(idx+1) +"/"+str(len(ids))+ " already processed")
        sys.stdout.flush()
    else:
        sys.stdout.write("\r" + str(idx+1) +"/"+str(len(ids))+ " processing       ")
        sys.stdout.flush()
    
        # Compile all stats with YT_compile
        ### - function I created to containerize scraping
        ### - basically Scraping Categories fully fleshed out
        df_temp = YT_compile.YT_vid_compile(user_id=cid)
       
        if df_temp.empty:
            print(cid)

        # Add features not included with YT_compile
        df_temp['channel'] = cid
        df_temp['id'] = idx

        # Add to list
        df_list.append(df_temp)

        # Add to DB
        df_temp.to_sql('videos',conn,if_exists='append')
            
9/3023 processing       
In [22]:
import sqlite3 as db
import pandas as pd
conn = db.connect('YT.db')
df = pd.read_sql('SELECT * FROM videos', con=conn)
df['publishedAt'] = pd.to_datetime(df['publishedAt'])
df = pd.concat(df_list)

Visualize ViewCounts over time

In [25]:
# Aggregate    

import matplotlib.pyplot as plt
import numpy as np

#df = pd.concat(df_list)
fig = plt.figure(figsize=(15,40))
count = 0
for label, did in df.groupby('channel'):
    
    a = did.set_index('publishedAt').resample('M').mean().fillna(value=0)

    count += 1
    ax = plt.subplot(20,4,count)
    
    # Plot resampled viewCounts
    did.set_index('publishedAt').resample('M').sum().fillna(0).plot(y='viewCount',ax=ax,label=label,legend=False)
    
    # Axis props
    ax.set_xlim(pd.Timestamp('2012-01-01'), pd.Timestamp.now())
    plt.xticks([])
    plt.yticks([])
    plt.xlabel('Date Published')
    plt.ylabel('views')
    plt.title('Channel #'+str(count))
    plt.tight_layout()
        

Generate Recency-Frequency Matrices

In [26]:
#fun_? are applied functions so that we can threshold/aggregate/cohort CLEANLY

def fun_R(val,now):
    '''returns TIME_ELAPSED since LAST video'''
    td = now-val.max()
    return td.days

def fun_F(val):
    '''returns # of video postings'''
    return val.count()

def fun_T(val,now):
    '''returns TIME_ELAPSED since FIRST video'''
    td = now-val.min()
    return td.days

df_RFT = pd.DataFrame()

df_RFT['R'] = df.groupby('id').apply(lambda row: fun_R(row['publishedAt'],pd.Timestamp.now()))
df_RFT['F'] = df.groupby('id').apply(lambda row: fun_F(row['publishedAt']))
df_RFT['T'] = df.groupby('id').apply(lambda row: fun_T(row['publishedAt'],pd.Timestamp.now()))
    
df_RFT.head()
Out[26]:
R F T
id
0 364 4 1277
1 0 173 1059
2 6 294 1901
3 25 44 3702
4 84 78 1771

Model Implementation

Work in progress. Currently trying to scrape enough data (3000+ channels) to run model without convergence.

Unfortunately I'm bottlenecked by Youtube API constraints so could take a couple of days

reference material

In [27]:
# import theano 

import pymc3 as pm
import numpy as np
import matplotlib.pyplot as plt
from pymc3.math import exp, log

class ParetoNBD(pm.Continuous):
    """
    Custom distribution class for Pareto/NBD likelihood.
    """
    
    def __init__(self, lambda_, mu, *args, **kwargs):
        super(ParetoNBD, self).__init__(*args, **kwargs)
        self.lambda_ = lambda_
        self.mu = mu
        
    def logp(self, x, t_x, T):
        """
        Loglikelihood function for and indvidual customer's purchasing rate \lambda
        and lifetime \mu given their frequency, recency and time since first purchase.
        """
        
        log_lambda = log(self.lambda_)
        log_mu = log(self.mu)
        mu_plus_lambda = self.lambda_ + self.mu
        log_mu_plus_lambda = log(mu_plus_lambda)
        
        p_1 = x * log_lambda + log_mu - log_mu_plus_lambda - t_x * mu_plus_lambda
        p_2 = (x + 1) * log_lambda - log_mu_plus_lambda - T * mu_plus_lambda
        
        return log(exp(p_1) + exp(p_2))
In [28]:
# Extract data for model following notation from Fader/Hardie
N = df_RFT.shape[0]
x = df_RFT['F'].values
t_x = df_RFT['R'].values
T = df_RFT['T'].values 

n_draws = 2000

pnbd_model = pm.Model()

with pnbd_model:

    r = pm.HalfCauchy('r', beta=2)
    alpha = pm.HalfCauchy('alpha', beta=2)
    s = pm.HalfCauchy('s', beta=2)
    beta = pm.HalfCauchy('beta', beta=2)
    
    # Gamma prior on purchasing rate parameter lambda
    lambda_ = pm.Gamma('lambda', alpha=r, beta=alpha, shape=N, testval=np.random.rand(N))
    # Gamma prior on lifetime parameter mu
    mu = pm.Gamma('mu', alpha=s, beta=beta, shape=N, testval=np.random.rand(N))

    # Custom distribution for Pareto-NBD likelihood function
    loglikelihood = ParetoNBD("loglikelihood", mu=mu, lambda_=lambda_, observed={'x': x, 't_x': t_x, 'T': T})
    
    # Sample the model
    trace = pm.sample(n_draws, init=None)
In [ ]:
def prob_alive_at_T(lambda_, mu, t_x, T):
    den = 1 + (mu / (lambda_ + mu)) * (np.exp((lambda_ + mu) * (T - t_x)) - 1)
    return 1 / den
In [ ]:
def likelihood(lambda_, mu, x, t, T):
    p1 = x * np.log(lambda_) + np.log(mu) - np.log(mu + lambda_) - t * (mu + lambda_)
    p2 = (x + 1) * np.log(lambda_) - np.log(mu + lambda_) - T * (mu + lambda_)
    return np.exp(p1) + np.exp(p2)

def predict(t, lambda_, mu, x, tx, T):
    like = likelihood(lambda_, mu, x, tx, T)
    p2 = lambda_ ** (x + 1) / mu * np.exp(-(lambda_ + mu) * T) * (1 - np.exp(-mu * t))

    return 1 / like * p2