The following script revolves around embeddings of news article titles, which together form what I'm going to call news space. We want to know which regions of news space are biased in the direction of left or right. To this end, we look at the news space of CNN (left), Fox News (right), and AP (center).
I hypothesized that there would be pockets of CNN-heavy and Fox-heavy regions around every news topic, indicating a left and a right spin on everything. What I found instead was that Fox doubles down on politics and covers other categories far less than CNN and AP do.
To understand what I did here, you have to understand what embeddings are. These are 768-dimensional vectors (of numbers), one per news article, made in such a way that articles that are similar in content end up near each other in embedding space. To read more about this, please go to my article here and read "Solution 3: Maps."
The following document is very code heavy; you're going to want to scroll all the way to the bottom, where you will see interactive maps. Just know that points are news article titles, and articles that are similar in content sit physically near each other on the map. Everything should be self-explanatory except KNN entropy: if this score is low, that region is dominated by one of AP, CNN, or FoxNews; if it is high, that region is relatively even between the three news sources.
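For intuition on what "near in embedding space" means, here is a toy sketch (not part of the pipeline below) using made-up 3-dimensional vectors in place of the real 768-dimensional ones: similar articles have a cosine similarity close to 1, unrelated ones close to 0.
import numpy as np
def cosine_similarity(a, b):
    # Cosine similarity: ~1 means "pointing the same way", ~0 means unrelated
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
# Hypothetical embeddings, for illustration only
election_title = np.array([0.9, 0.1, 0.0])
senate_title = np.array([0.8, 0.2, 0.1])
cooking_title = np.array([0.0, 0.1, 0.9])
print(cosine_similarity(election_title, senate_title))   # ~0.98, near each other
print(cosine_similarity(election_title, cooking_title))  # ~0.01, far apart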
import pandas as pd
cnn = pd.read_feather('../data/CNN_sentence_embeddings.feather')
fox = pd.read_feather('../data/FoxNews_sentence_embeddings.feather')
ap = pd.read_feather('../data/AP_sentence_embeddings.feather')
# For each df, only the first n sentences are used
num_sentences = 50000
cnn = cnn.iloc[:num_sentences]
fox = fox.iloc[:num_sentences]
ap = ap.iloc[:num_sentences]
# Concatenate all three into a single data frame
df = pd.concat([cnn, fox, ap], ignore_index=True)
from datetime import datetime
# Only keep articles within a set number of days of the first article's date
df['Date'] = [i.replace('T', ' ') for i in df['Date']]
df['Date'] = [i.replace('Z', '') for i in df['Date']]
df['Date'] = [datetime.strptime(i, '%Y-%m-%d %H:%M:%S') for i in df['Date']]
df['Time_delta'] = [df['Date'][0] - i for i in df['Date']]
df['Time_delta'] = [i.days for i in df['Time_delta']]
df['Date'] = [str(i) for i in df['Date']]
df = df[df['Time_delta'] < 60].reset_index(drop=True)
# Calculate the max time deltas for each news User
max_time_delta = df.groupby('User')['Time_delta'].max().reset_index()
display(max_time_delta)
# Tabulate each User
user_tab = df.groupby('User')['Time_delta'].count().reset_index()
display(user_tab)
| | User | Time_delta |
|---|---|---|
| 0 | AP | 59 |
| 1 | CNN | 59 |
| 2 | FoxNews | 59 |

| | User | Time_delta |
|---|---|---|
| 0 | AP | 4335 |
| 1 | CNN | 6003 |
| 2 | FoxNews | 12246 |
import numpy as np
np.random.seed(42)
# Randomly downsample each User to the minimum number of posts across Users
min_posts = min(df['User'].value_counts())
df = df.groupby('User').apply(lambda x: x.sample(min_posts)).reset_index(drop=True)
# Calculate the max time deltas for each news User
max_time_delta = df.groupby('User')['Time_delta'].max().reset_index()
display(max_time_delta)
# Tabulate each User
user_tab = df.groupby('User')['Time_delta'].count().reset_index()
user_tab
| | User | Time_delta |
|---|---|---|
| 0 | AP | 59 |
| 1 | CNN | 59 |
| 2 | FoxNews | 59 |

| | User | Time_delta |
|---|---|---|
| 0 | AP | 4335 |
| 1 | CNN | 4335 |
| 2 | FoxNews | 4335 |
# Keep only the embedding columns (the first 768 columns, named 0 through 767)
em = df.iloc[:, 0:768]
display(em.shape)
(13005, 768)
What we're going to do is take the manifold of articles and find the K-nearest neighbors (KNN) of each article in the original 768-dimensional space. We then look at the distribution of CNN, Fox, and AP within each article's KNN. Are there pockets of only CNN? Only Fox? Only AP?
import numpy as np
import pandas as pd
from annoy import AnnoyIndex
# Prepare the Annoy index
def build_annoy_index(em, num_trees=10):
num_items, embedding_size = em.shape
t = AnnoyIndex(embedding_size, 'angular')
for i in range(num_items):
t.add_item(i, em.iloc[i, :])
t.build(num_trees)
return t
# Get the KNNs for each item in the index
def get_knns_for_all(index, k, num_items):
knns = [index.get_nns_by_item(i, k) for i in range(num_items)]
return knns
# Build Annoy index
annoy_index = build_annoy_index(em)
# Get the KNNs for each item
k = 20
num_items = em.shape[0]
knn = get_knns_for_all(annoy_index, k, num_items)
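Annoy returns approximate nearest neighbors. If you want to spot-check how close the approximation is, one minimal sketch (not part of the original pipeline) is to compare Annoy's neighbors for a single article against exact cosine neighbors computed with numpy:
# Normalize the embeddings so that a dot product is cosine similarity
X = em.to_numpy(dtype=np.float32)
X = X / np.linalg.norm(X, axis=1, keepdims=True)
item = 0  # arbitrary article to check
exact = set(np.argsort(-(X @ X[item]))[:k])         # exact top-k by cosine similarity
approx = set(annoy_index.get_nns_by_item(item, k))  # Annoy's approximate top-k
print(f"Exact vs. Annoy neighbor overlap for item {item}: {len(exact & approx) / k:.0%}")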
# Pull out the columns of the df that are NOT in em
df = df.iloc[:, 768:]
# For the indices in question, get the corresponding column called User in df
def get_users_for_knns(knn, df):
return df.iloc[knn, df.columns.get_loc('User')]
# Get the users for each KNN
users = [get_users_for_knns(knn[i], df) for i in range(num_items)]
# Visualize the first 3
users[0:3]
[0 AP 2181 AP 281 AP 12833 FoxNews 3722 AP 6051 CNN 8948 FoxNews 694 AP 2451 AP 11763 FoxNews 560 AP 5065 CNN 10942 FoxNews 989 AP 713 AP 662 AP 1247 AP 2084 AP 8966 FoxNews 8976 FoxNews Name: User, dtype: object,
1 AP 4788 CNN 1715 AP 5001 CNN 5825 CNN 7594 CNN 2939 AP 1534 AP 1783 AP 2081 AP 1635 AP 3572 AP 1115 AP 10772 FoxNews 7584 CNN 3140 AP 2442 AP 1697 AP 1671 AP 5888 CNN Name: User, dtype: object,
2 AP 3240 AP 6195 CNN 235 AP 8455 CNN 1315 AP 4145 AP 3374 AP 350 AP 103 AP 4138 AP 1931 AP 1242 AP 1991 AP 5546 CNN 6366 CNN 8221 CNN 3482 AP 7880 CNN 6718 CNN Name: User, dtype: object]
What we're doing below is calculating the Shannon entropy for each KNN. The idea is that lower entropy corresponds to KNNs that are more one-sided. E.g., for a KNN of 10, a neighborhood of 8 CNN, 1 Fox, and 1 AP has lower entropy than one of 4 CNN, 3 Fox, and 3 AP. We want to know which regions of news space are one-sided and which are balanced in terms of the news media outlets in question.
# For each KNN, get the Shannon Entropy of the set of Users
def get_entropy_for_knn(users):
return pd.Series(users).value_counts(normalize=True).apply(lambda p: -p*np.log(p)).sum()
# Get the entropy for each KNN
entropy = [get_entropy_for_knn(users[i]) for i in range(num_items)]
# Add the entropy to the df
df['knn_entropy'] = entropy
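As a quick check of the intuition above, we can feed two toy neighborhoods (made-up labels, not real data) through the same function. Note that the maximum possible value for three sources is ln(3) ≈ 1.10, which is why the most balanced regions in the table further down sit near 1.096.
lopsided = pd.Series(['CNN'] * 8 + ['FoxNews'] + ['AP'])
balanced = pd.Series(['CNN'] * 4 + ['FoxNews'] * 3 + ['AP'] * 3)
print(get_entropy_for_knn(lopsided))  # ~0.64: a one-sided neighborhood
print(get_entropy_for_knn(balanced))  # ~1.09: close to the ln(3) maximum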
Of the low-entropy regions, which are primarily CNN, which primarily Fox, and which primarily AP? To answer this, we compute per-KNN fractions as features we can color by later.
# Get the per-KNN fraction for a given user (CNN by default)
def get_fraction_user_for_knn(users, user='CNN'):
value_counts = users.value_counts(normalize=True)
return value_counts.get(user, 0)
# Get the fraction CNN for each KNN
fraction_cnn = [get_fraction_user_for_knn(users[i], user = 'CNN') for i in range(num_items)]
fraction_fox = [get_fraction_user_for_knn(users[i], user='FoxNews') for i in range(num_items)]
fraction_ap = [get_fraction_user_for_knn(users[i], user='AP') for i in range(num_items)]
# Add each of these to df
df['knn_fraction_cnn'] = fraction_cnn
df['knn_fraction_fox'] = fraction_fox
df['knn_fraction_ap'] = fraction_ap
# Sort dataframe by entropy
df = df.sort_values(by='knn_entropy', ascending=False)
df
| | Url | Date | ID | ConversationID | Language | Source | User | Likes | Retweets | Replies | ... | TCooutlinks | RetweetedTweet | QuotedTweet | MentionedUsers | Unnamed: 0 | Time_delta | knn_entropy | knn_fraction_cnn | knn_fraction_fox | knn_fraction_ap |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5373 | https://twitter.com/CNN/status/162196936511234... | 2023-02-04 20:30:06 | 1.621969e+18 | 1.621969e+18 | en | <a href="http://www.socialflow.com" rel="nofol... | CNN | 181 | 47 | 41 | ... | ['https://t.co/UIRpxEe4pT'] | NaN | NaN | None | NaN | 58 | 1.096067 | 0.35 | 0.30 | 0.35 |
3544 | https://twitter.com/AP/status/1625369082898808833 | 2023-02-14 05:39:21 | 1.625369e+18 | 1.625325e+18 | en | <a href="http://www.socialflow.com" rel="nofol... | AP | 246 | 159 | 24 | ... | ['https://t.co/MgxFoigPt4'] | NaN | None | None | NaN | 48 | 1.096067 | 0.35 | 0.30 | 0.35 |
4424 | https://twitter.com/CNN/status/163705758350353... | 2023-03-18 11:45:17 | 1.637058e+18 | 1.637055e+18 | en | <a href="https://about.twitter.com/products/tw... | CNN | 124 | 29 | 12 | ... | [] | NaN | NaN | None | NaN | 16 | 1.096067 | 0.35 | 0.30 | 0.35 |
11677 | https://twitter.com/FoxNews/status/16427236576... | 2023-04-03 03:00:14 | 1.642724e+18 | 1.642724e+18 | en | <a href="http://www.socialflow.com" rel="nofol... | FoxNews | 46 | 5 | 19 | ... | ['https://t.co/0RzFhkYkwK'] | NaN | NaN | None | NaN | 0 | 1.096067 | 0.35 | 0.35 | 0.30 |
3364 | https://twitter.com/AP/status/1632874963605528576 | 2023-03-06 22:45:03 | 1.632875e+18 | 1.632875e+18 | en | <a href="https://trueanthem.com/" rel="nofollo... | AP | 250 | 62 | 23 | ... | ['https://t.co/SdHqIe9y2Q'] | NaN | None | None | NaN | 28 | 1.096067 | 0.35 | 0.30 | 0.35 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5960 | https://twitter.com/CNN/status/164176480358366... | 2023-03-31 11:30:06 | 1.641765e+18 | 1.641765e+18 | en | <a href="http://www.socialflow.com" rel="nofol... | CNN | 139 | 23 | 18 | ... | [] | NaN | NaN | [User(username='EvaLongoria', id=110827653, di... | NaN | 3 | 0.000000 | 1.00 | 0.00 | 0.00 |
5961 | https://twitter.com/CNN/status/162547622622264... | 2023-02-14 12:45:06 | 1.625476e+18 | 1.625476e+18 | en | <a href="http://www.socialflow.com" rel="nofol... | CNN | 197 | 44 | 195 | ... | ['https://t.co/TOj1UROo9b'] | NaN | NaN | None | NaN | 48 | 0.000000 | 1.00 | 0.00 | 0.00 |
7819 | https://twitter.com/CNN/status/163783515809395... | 2023-03-20 15:15:05 | 1.637835e+18 | 1.637835e+18 | en | <a href="http://www.socialflow.com" rel="nofol... | CNN | 245 | 49 | 40 | ... | [] | NaN | NaN | [User(username='EvaLongoria', id=110827653, di... | NaN | 14 | 0.000000 | 1.00 | 0.00 | 0.00 |
6764 | https://twitter.com/CNN/status/163894877786329... | 2023-03-23 17:00:13 | 1.638949e+18 | 1.638949e+18 | en | <a href="http://www.socialflow.com" rel="nofol... | CNN | 298 | 51 | 35 | ... | [] | NaN | NaN | [User(username='EvaLongoria', id=110827653, di... | NaN | 11 | 0.000000 | 1.00 | 0.00 | 0.00 |
7881 | https://twitter.com/CNN/status/164270098714289... | 2023-04-03 01:30:09 | 1.642701e+18 | 1.642701e+18 | en | <a href="http://www.socialflow.com" rel="nofol... | CNN | 207 | 39 | 24 | ... | [] | NaN | NaN | None | NaN | 0 | 0.000000 | 1.00 | 0.00 | 0.00 |
13005 rows × 25 columns
We use UMAP as a visualization tool to see our entropy and fraction-per-outlet scores. Note that we don't do the KNN-based computations on the 2-D map; those use the original 768 dimensions. This is because UMAP (and dimensionality reduction in general) is lossy: you lose information when you compress your data from 768 dimensions down to 2.
import umap
np.random.seed(42)
# Run UMAP (densMAP variant, 2 components) on the embeddings;
# name the result umap_coords so we don't shadow the umap module
umap_coords = umap.UMAP(densmap=True, n_components=2, random_state=42).fit_transform(em)
# Name the columns umap1 and umap2
umap_coords = pd.DataFrame(umap_coords, columns=['umap1', 'umap2'])
# Combine with the df (aligned on the original row index)
df = pd.concat([df, umap_coords], axis=1)
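To see why the KNN work stays in 768 dimensions, one optional check (a sketch; the exact number will vary with the seed) is to measure how much an article's neighbors on the 2-D map overlap with its neighbors in the original space. The overlap is generally incomplete, which is the information loss referred to above.
# umap_coords rows are in the same order as em, so they line up with knn
coords = umap_coords[['umap1', 'umap2']].to_numpy()
item = 0  # arbitrary article to check
dists_2d = np.linalg.norm(coords - coords[item], axis=1)
knn_2d = set(np.argsort(dists_2d)[:k])  # k nearest neighbors on the 2-D map
knn_768d = set(knn[item])               # k nearest neighbors in the original space
print(f"2-D vs 768-D neighbor overlap for item {item}: {len(knn_2d & knn_768d) / k:.0%}")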
Below we do a little bit of cleanup to make the visuals easier to navigate. We trim the data so we only look at the last two weeks, though we did the KNN computations on more than that.
# Drop NaN values from Tweet
# df = df.dropna(subset=['Tweet']) It appears that the NAs come from the sort by time-delta
df.Tweet # Debug
5373    On Friday night, a new national record for low...
3544    BREAKING: Police say the suspect in the fatal ...
4424    CNN also identified more than 25 mariners who ...
11677   Melissa Rivers shares do's and don'ts of count...
3364    The war in Ukraine has created a surge in dema...
                              ...
5960    .@EvaLongoria’s culinary journey through Mexic...
5961    Here are 5️⃣ things you need to know today:\n\...
7819    .@EvaLongoria is coming to CNN in the new CNN ...
6764    Can you name a more iconic duo than chocolate ...
7881    Dutch cheese is the secret to this beloved Mex...
Name: Tweet, Length: 13005, dtype: object
# Strip trailing URLs from the tweet text
df['Tweet'] = [i.split('http')[0] for i in df['Tweet']]
# Wrap long titles so the hover text stays readable
df['Tweet'] = df['Tweet'].str.wrap(30)
# Plotly hover text needs <br> rather than \n for line breaks
df['Tweet'] = df['Tweet'].apply(lambda x: x.replace('\n', '<br>'))
We visualize the output using plotly, an interactive graphics library. You can see the article title and other data when you hover over the points of interest.
import plotly.express as px
# You need this so the plots will render in an exported HTML page
import plotly.io as pio
pio.renderers.default = "notebook_connected"
# Do a plotly scatter and color by entropy
fig = px.scatter(df, x='umap1', y='umap2', color='knn_entropy', size_max = 5, hover_data=['User', 'Tweet', 'Likes', 'knn_entropy'])
fig.show()
fig = px.scatter(df, x='umap1', y='umap2', color='User', size_max = 5, hover_data=['User', 'Tweet', 'Likes', 'knn_entropy'])
fig.show()
# Color by fraction CNN, Fox, or AP
fig = px.scatter(df, x='umap1', y='umap2', color='knn_fraction_cnn', size_max = 5, hover_data=['User', 'Tweet', 'Likes', 'knn_entropy'])
fig.show()
fig = px.scatter(df, x='umap1', y='umap2', color='knn_fraction_fox', size_max = 5, hover_data=['User', 'Tweet', 'Likes', 'knn_entropy'])
fig.show()
fig = px.scatter(df, x='umap1', y='umap2', color='knn_fraction_ap', size_max = 5, hover_data=['User', 'Tweet', 'Likes', 'knn_entropy'])
fig.show()