from bs4 import BeautifulSoup
import requests
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
from pprint import pprint
key = "redacted"
cat_ids = []
cat_names = []
url = 'https://www.googleapis.com/youtube/v3/videoCategories?part=snippet&hl=en&regionCode=US&key={}'.format(key)
r = requests.get(url).json()
for rid in r['items']:
    cat_ids.append(rid['id'])
    cat_names.append(rid['snippet']['title'])
cat_dict = dict(zip(cat_ids,cat_names))
#pprint(cat_dict)
max_npt = 200
cat_id = 22
ids = []
cats = []
for cat_id in cat_ids:
    url = 'https://www.googleapis.com/youtube/v3/search?part=snippet&type=video&videoCategoryId={}&maxResults=50&key={}'.format(cat_id,key)
    r = requests.get(url).json()
    chans = []
    # First page of search results
    for vid in r['items']:
        if vid['id']['kind'] == 'youtube#video':
            chans.append(vid['snippet']['channelId'])
    count = 0
    # Follow nextPageToken until the results run out (or we hit max_npt pages)
    while ('nextPageToken' in r) and (count < max_npt):
        npt = r['nextPageToken']
        url = 'https://www.googleapis.com/youtube/v3/search?part=snippet&type=video&videoCategoryId={}&maxResults=50&key={}&pageToken={}'.format(cat_id,key,npt)
        r = requests.get(url).json()
        count += 1
        for vid in r['items']:
            if vid['id']['kind'] == 'youtube#video':
                chans.append(vid['snippet']['channelId'])
    # De-duplicate channels within this category
    ids.extend(set(chans))
    cats.extend([cat_id] * len(set(chans)))
    print(cat_dict[cat_id] + ': ' + str(len(set(chans))) + ' channels, ' + str(len(chans)) + ' videos')
import YT_compile
import sqlite3 as db
import importlib
importlib.reload(YT_compile)
import YT_compile
cid = "UCs9cE8_vVtbqF1l5nuhYirw"
df = YT_compile.YT_vid_compile(user_id=cid)
df['channel'] = cid
conn = db.connect('YT.db')
try:
    df.to_sql('videos',conn,if_exists='fail')
except ValueError:
    print("Table 'videos' already exists")
df_list = []
for idx,cid in enumerate(ids):
    ## Just an example run; stop after a few channels rather than processing everything
    if idx > 8:
        break
    ## Check DB for user
    conn = db.connect('YT.db')
    c = conn.cursor()
    # Get the already-processed channels from videos
    try:
        c.execute("SELECT DISTINCT channel FROM videos")
    # If videos doesn't exist yet, bootstrap a new table from the first channel
    except db.OperationalError:
        print('Building new DB...')
        df_list = []
        print(cid)
        cid = ids[0]
        df_temp = YT_compile.YT_vid_compile(user_id=cid)
        # Add features not included with YT_compile
        df_temp['channel'] = cid
        df_temp['id'] = idx
        # Add to list
        df_list.append(df_temp)
        # Build YT.db if it doesn't already exist
        conn = db.connect('YT.db')
        df_list[0].to_sql('videos',conn,if_exists='fail')
        c.execute("SELECT DISTINCT channel FROM videos")
    # Build a set with the channel names already in the DB
    names = {name[0] for name in c.fetchall()}
    if cid in names:
        sys.stdout.write("\r" + str(idx+1) + "/" + str(len(ids)) + " already processed")
        sys.stdout.flush()
    else:
        sys.stdout.write("\r" + str(idx+1) + "/" + str(len(ids)) + " processing ")
        sys.stdout.flush()
        # Compile all stats with YT_compile
        ### - function I created to containerize scraping
        ### - basically Scraping Categories fully fleshed out
        df_temp = YT_compile.YT_vid_compile(user_id=cid)
        if df_temp.empty:
            print(cid)
        # Add features not included with YT_compile
        df_temp['channel'] = cid
        df_temp['id'] = idx
        # Add to list
        df_list.append(df_temp)
        # Add to DB
        df_temp.to_sql('videos',conn,if_exists='append')
import sqlite3 as db
import pandas as pd
conn = db.connect('YT.db')
df = pd.read_sql('SELECT * FROM videos', con=conn)
df['publishedAt'] = pd.to_datetime(df['publishedAt'])
#df = pd.concat(df_list)  # alternative: build df straight from the in-memory list instead of the DB
# Aggregate
import matplotlib.pyplot as plt
import numpy as np
#df = pd.concat(df_list)
fig = plt.figure(figsize=(15,40))
count = 0
for label, did in df.groupby('channel'):
    count += 1
    ax = plt.subplot(20,4,count)
    # Plot monthly-resampled viewCounts for each channel
    did.set_index('publishedAt').resample('M').sum().fillna(0).plot(y='viewCount',ax=ax,label=label,legend=False)
    # Axis props
    ax.set_xlim(pd.Timestamp('2012-01-01'), pd.Timestamp.now())
    plt.xticks([])
    plt.yticks([])
    plt.xlabel('Date Published')
    plt.ylabel('views')
    plt.title('Channel #'+str(count))
plt.tight_layout()
# fun_* are applied per-group so that we can threshold/aggregate/cohort cleanly
def fun_R(val,now):
    '''returns TIME_ELAPSED since LAST video (recency, in days)'''
    td = now - val.max()
    return td.days

def fun_F(val):
    '''returns # of video postings (frequency)'''
    return val.count()

def fun_T(val,now):
    '''returns TIME_ELAPSED since FIRST video (age, in days)'''
    td = now - val.min()
    return td.days

df_RFT = pd.DataFrame()
df_RFT['R'] = df.groupby('id').apply(lambda grp: fun_R(grp['publishedAt'],pd.Timestamp.now()))
df_RFT['F'] = df.groupby('id').apply(lambda grp: fun_F(grp['publishedAt']))
df_RFT['T'] = df.groupby('id').apply(lambda grp: fun_T(grp['publishedAt'],pd.Timestamp.now()))
df_RFT.head()
Work in progress. I'm currently trying to scrape enough data (3000+ channels) to fit the model without convergence problems.
Unfortunately I'm bottlenecked by YouTube API quota constraints, so this could take a couple of days.
# import theano
import pymc3 as pm
import numpy as np
import matplotlib.pyplot as plt
from pymc3.math import exp, log
class ParetoNBD(pm.Continuous):
    """
    Custom distribution class for the Pareto/NBD likelihood.
    """
    def __init__(self, lambda_, mu, *args, **kwargs):
        super(ParetoNBD, self).__init__(*args, **kwargs)
        self.lambda_ = lambda_
        self.mu = mu

    def logp(self, x, t_x, T):
        """
        Log-likelihood function for an individual customer's purchasing rate \lambda
        and lifetime \mu, given their frequency, recency, and time since first purchase.
        """
        log_lambda = log(self.lambda_)
        log_mu = log(self.mu)
        mu_plus_lambda = self.lambda_ + self.mu
        log_mu_plus_lambda = log(mu_plus_lambda)
        p_1 = x * log_lambda + log_mu - log_mu_plus_lambda - t_x * mu_plus_lambda
        p_2 = (x + 1) * log_lambda - log_mu_plus_lambda - T * mu_plus_lambda
        return log(exp(p_1) + exp(p_2))
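For reference, the two terms p_1 and p_2 in logp above are the logs of the two pieces of the individual-level Pareto/NBD likelihood (Fader/Hardie notation, with x = frequency, t_x = recency, T = length of the observation window), so logp returns log L with

$$ L(\lambda, \mu \mid x, t_x, T) = \frac{\lambda^{x}\,\mu}{\lambda + \mu}\, e^{-(\lambda+\mu)\, t_x} \;+\; \frac{\lambda^{x+1}}{\lambda + \mu}\, e^{-(\lambda+\mu)\, T}. $$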
# Extract data for model following notation from Fader/Hardie
N = df_RFT.shape[0]
x = df_RFT['F'].values
t_x = df_RFT['R'].values
T = df_RFT['T'].values
n_draws = 2000
pnbd_model = pm.Model()
with pnbd_model:
    r = pm.HalfCauchy('r', beta=2)
    alpha = pm.HalfCauchy('alpha', beta=2)
    s = pm.HalfCauchy('s', beta=2)
    beta = pm.HalfCauchy('beta', beta=2)
    # Gamma prior on purchasing rate parameter lambda
    lambda_ = pm.Gamma('lambda', alpha=r, beta=alpha, shape=N, testval=np.random.rand(N))
    # Gamma prior on lifetime parameter mu
    mu = pm.Gamma('mu', alpha=s, beta=beta, shape=N, testval=np.random.rand(N))
    # Custom distribution for the Pareto/NBD likelihood function
    loglikelihood = ParetoNBD("loglikelihood", mu=mu, lambda_=lambda_, observed={'x': x, 't_x': t_x, 'T': T})
    # Sample the model
    trace = pm.sample(n_draws, init=None)
def prob_alive_at_T(lambda_, mu, t_x, T):
    '''Probability a channel is still "alive" at the end of its observation window T'''
    den = 1 + (mu / (lambda_ + mu)) * (np.exp((lambda_ + mu) * (T - t_x)) - 1)
    return 1 / den
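For reference, this is the standard Pareto/NBD probability that an individual (here, a channel) is still active at the end of its observation window, written exactly as the function computes it:

$$ P(\text{alive} \mid \lambda, \mu, t_x, T) = \left[\, 1 + \frac{\mu}{\lambda + \mu}\left( e^{(\lambda+\mu)(T - t_x)} - 1 \right) \right]^{-1}. $$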
def likelihood(lambda_, mu, x, t, T):
    '''Individual-level Pareto/NBD likelihood (numpy version of ParetoNBD.logp, not logged)'''
    p1 = x * np.log(lambda_) + np.log(mu) - np.log(mu + lambda_) - t * (mu + lambda_)
    p2 = (x + 1) * np.log(lambda_) - np.log(mu + lambda_) - T * (mu + lambda_)
    return np.exp(p1) + np.exp(p2)

def predict(t, lambda_, mu, x, tx, T):
    '''Expected number of videos posted over the next t days, given lambda, mu and the RFT data'''
    like = likelihood(lambda_, mu, x, tx, T)
    p2 = lambda_ ** (x + 1) / mu * np.exp(-(lambda_ + mu) * T) * (1 - np.exp(-mu * t))
    return 1 / like * p2
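A minimal sketch of how these helpers could be applied once sampling finishes: take posterior means of the per-channel lambda and mu from the trace and feed them through prob_alive_at_T and predict. The names lambda_post, p_alive, E_videos_90d and the 90-day horizon below are illustrative choices, not part of the analysis above.

# Sketch: apply the helpers to posterior means from the trace (illustrative only)
lambda_post = trace['lambda'].mean(axis=0)   # posterior mean posting rate per channel
mu_post = trace['mu'].mean(axis=0)           # posterior mean dropout rate per channel
# Probability each channel is still active at the end of its observation window
p_alive = prob_alive_at_T(lambda_post, mu_post, t_x, T)
# Expected number of new videos over an arbitrary 90-day horizon
expected_videos_90d = predict(90, lambda_post, mu_post, x, t_x, T)
df_RFT['p_alive'] = p_alive
df_RFT['E_videos_90d'] = expected_videos_90d
df_RFT.head()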