In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Fix: `from scipy.stats import stats` imported the private scipy.stats.stats
# submodule (deprecated and removed in SciPy >= 1.8). The public API is the
# scipy.stats package itself. (NOTE(review): `stats` is never used in the
# visible notebook — consider removing it entirely.)
from scipy import stats
In [2]:
%matplotlib inline
import warnings
# NOTE(review): this silences *all* warnings notebook-wide, including pandas
# deprecation notices (e.g. for error_bad_lines below) — consider narrowing
# the filter to specific categories instead.
warnings.filterwarnings("ignore")
In [3]:
# Display every DataFrame column without truncation.
pd.set_option("display.max_columns", None)
In [4]:
# Load the comments dataset, skipping malformed rows (wrong field count).
# on_bad_lines='skip' replaces error_bad_lines=False, which was deprecated
# in pandas 1.3 and removed in pandas 2.0.
US_comments = pd.read_csv('UScomments.csv', on_bad_lines='skip')
b'Skipping line 41589: expected 4 fields, saw 11\nSkipping line 51628: expected 4 fields, saw 7\nSkipping line 114465: expected 4 fields, saw 5\n'
b'Skipping line 142496: expected 4 fields, saw 8\nSkipping line 189732: expected 4 fields, saw 6\nSkipping line 245218: expected 4 fields, saw 7\n'
b'Skipping line 388430: expected 4 fields, saw 5\n'
In [5]:
US_comments.head()
Out[5]:
video_id comment_text likes replies
0 XpVt6Z1Gjjo Logan Paul it's yo big day ‼️‼️‼️ 4 0
1 XpVt6Z1Gjjo I've been following you from the start of your... 3 0
2 XpVt6Z1Gjjo Say hi to Kong and maverick for me 3 0
3 XpVt6Z1Gjjo MY FAN . attendance 3 0
4 XpVt6Z1Gjjo trending 😉 3 0
In [6]:
# Load the videos dataset, skipping malformed rows.
# on_bad_lines='skip' replaces the deprecated/removed error_bad_lines=False.
US_videos = pd.read_csv('USvideos.csv', on_bad_lines='skip')
In [7]:
US_videos.head()
Out[7]:
video_id title channel_title category_id tags views likes dislikes comment_total thumbnail_link date Unnamed: 11 Unnamed: 12 Unnamed: 13 Unnamed: 14 Unnamed: 15 Unnamed: 16 Unnamed: 17 Unnamed: 18 Unnamed: 19 Unnamed: 20
0 XpVt6Z1Gjjo 1 YEAR OF VLOGGING -- HOW LOGAN PAUL CHANGED Y... Logan Paul Vlogs 24 logan paul vlog|logan paul|logan|paul|olympics... 4394029 320053 5931 46245 https://i.ytimg.com/vi/XpVt6Z1Gjjo/default.jpg 13.09 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 K4wEI5zhHB0 iPhone X — Introducing iPhone X — Apple Apple 28 Apple|iPhone 10|iPhone Ten|iPhone|Portrait Lig... 7860119 185853 26679 0 https://i.ytimg.com/vi/K4wEI5zhHB0/default.jpg 13.09 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 cLdxuaxaQwc My Response PewDiePie 22 [none] 5845909 576597 39774 170708 https://i.ytimg.com/vi/cLdxuaxaQwc/default.jpg 13.09 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 WYYvHb03Eog Apple iPhone X first look The Verge 28 apple iphone x hands on|Apple iPhone X|iPhone ... 2642103 24975 4542 12829 https://i.ytimg.com/vi/WYYvHb03Eog/default.jpg 13.09 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 sjlHnJvXdQs iPhone X (parody) jacksfilms 23 jacksfilms|parody|parodies|iphone|iphone x|iph... 1168130 96666 568 6666 https://i.ytimg.com/vi/sjlHnJvXdQs/default.jpg 13.09 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

Data exploration and wrangling

In [8]:
US_videos.shape
Out[8]:
(7998, 21)
In [9]:
US_videos.nunique()
Out[9]:
video_id          2335
title             2398
channel_title     1230
category_id         16
tags              2204
views             7944
likes             6627
dislikes          2532
comment_total     4153
thumbnail_link    2364
date                43
Unnamed: 11          3
Unnamed: 12          2
Unnamed: 13          2
Unnamed: 14          2
Unnamed: 15          2
Unnamed: 16          2
Unnamed: 17          2
Unnamed: 18          2
Unnamed: 19          2
Unnamed: 20          2
dtype: int64
In [10]:
US_videos.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7998 entries, 0 to 7997
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   video_id        7998 non-null   object 
 1   title           7998 non-null   object 
 2   channel_title   7998 non-null   object 
 3   category_id     7998 non-null   int64  
 4   tags            7998 non-null   object 
 5   views           7998 non-null   int64  
 6   likes           7998 non-null   int64  
 7   dislikes        7998 non-null   int64  
 8   comment_total   7998 non-null   int64  
 9   thumbnail_link  7998 non-null   object 
 10  date            7998 non-null   object 
 11  Unnamed: 11     6 non-null      object 
 12  Unnamed: 12     2 non-null      object 
 13  Unnamed: 13     2 non-null      float64
 14  Unnamed: 14     2 non-null      object 
 15  Unnamed: 15     2 non-null      float64
 16  Unnamed: 16     2 non-null      float64
 17  Unnamed: 17     2 non-null      float64
 18  Unnamed: 18     2 non-null      float64
 19  Unnamed: 19     2 non-null      object 
 20  Unnamed: 20     2 non-null      float64
dtypes: float64(6), int64(5), object(10)
memory usage: 1.3+ MB
In [11]:
US_comments.shape
Out[11]:
(691400, 4)
In [12]:
US_comments.nunique()
Out[12]:
video_id          2266
comment_text    434076
likes             1284
replies            479
dtype: int64
In [13]:
US_comments.isnull().sum()
Out[13]:
video_id         0
comment_text    25
likes            0
replies          0
dtype: int64
In [14]:
# Drop the 25 rows with missing comment_text. Reassignment (rather than
# inplace=True) keeps the cell idempotent and avoids hidden-state mutation
# on re-runs.
US_comments = US_comments.dropna()
In [15]:
US_comments.isnull().sum()
Out[15]:
video_id        0
comment_text    0
likes           0
replies         0
dtype: int64
In [16]:
US_comments.shape
Out[16]:
(691375, 4)
In [17]:
US_comments.nunique()
Out[17]:
video_id          2266
comment_text    434076
likes             1284
replies            479
dtype: int64
In [18]:
US_comments.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 691375 entries, 0 to 691399
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   video_id      691375 non-null  object
 1   comment_text  691375 non-null  object
 2   likes         691375 non-null  object
 3   replies       691375 non-null  object
dtypes: object(4)
memory usage: 26.4+ MB
In [19]:
# Drop the row at index label 41587.
# NOTE(review): magic number — presumably this row holds non-numeric
# likes/replies values that would break the astype(int) casts in the next
# cell; confirm, and prefer a content-based filter (e.g. str.isnumeric) so
# the cell survives data changes.
US_comments.drop(41587, inplace=True)
In [20]:
# likes and replies were parsed as strings; cast both count columns to
# integers in a single astype call.
US_comments = US_comments.astype({'likes': int, 'replies': int})
In [21]:
US_comments.head()
Out[21]:
video_id comment_text likes replies
0 XpVt6Z1Gjjo Logan Paul it's yo big day ‼️‼️‼️ 4 0
1 XpVt6Z1Gjjo I've been following you from the start of your... 3 0
2 XpVt6Z1Gjjo Say hi to Kong and maverick for me 3 0
3 XpVt6Z1Gjjo MY FAN . attendance 3 0
4 XpVt6Z1Gjjo trending 😉 3 0

Time for sentiment analysis on US_comments data

In [22]:
import nltk
nltk.download('vader_lexicon')
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/valazeinali/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Out[22]:
True
In [23]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
In [24]:
# VADER compound polarity score (in [-1, 1]) for every comment.
US_comments['Sentiment Scores'] = US_comments['comment_text'].map(
    lambda text: sia.polarity_scores(text)['compound']
)
In [25]:
US_comments.head()
Out[25]:
video_id comment_text likes replies Sentiment Scores
0 XpVt6Z1Gjjo Logan Paul it's yo big day ‼️‼️‼️ 4 0 0.0000
1 XpVt6Z1Gjjo I've been following you from the start of your... 3 0 0.0000
2 XpVt6Z1Gjjo Say hi to Kong and maverick for me 3 0 0.0000
3 XpVt6Z1Gjjo MY FAN . attendance 3 0 0.4648
4 XpVt6Z1Gjjo trending 😉 3 0 0.0000

Classify each sentiment score as positive, negative, or neutral

In [26]:
def classify_sentiment(score):
    """Map a VADER compound score to a categorical sentiment label."""
    if score > 0:
        return 'Positive'
    if score < 0:
        return 'Negative'
    return 'Neutral'

US_comments['Sentiment'] = US_comments['Sentiment Scores'].apply(classify_sentiment)
In [27]:
US_comments.head()
Out[27]:
video_id comment_text likes replies Sentiment Scores Sentiment
0 XpVt6Z1Gjjo Logan Paul it's yo big day ‼️‼️‼️ 4 0 0.0000 Neutral
1 XpVt6Z1Gjjo I've been following you from the start of your... 3 0 0.0000 Neutral
2 XpVt6Z1Gjjo Say hi to Kong and maverick for me 3 0 0.0000 Neutral
3 XpVt6Z1Gjjo MY FAN . attendance 3 0 0.4648 Positive
4 XpVt6Z1Gjjo trending 😉 3 0 0.0000 Neutral
In [28]:
US_comments.Sentiment.value_counts()
Out[28]:
Positive    313651
Neutral     235961
Negative    141762
Name: Sentiment, dtype: int64
In [29]:
US_comments.video_id.nunique()
Out[29]:
2266

Count the positive comments per video and compute each video's percentage of positive comments

In [30]:
US_comments[US_comments.video_id == US_comments.video_id.unique()[0]]['Sentiment'].value_counts()
Out[30]:
Neutral     299
Positive    289
Negative    212
Name: Sentiment, dtype: int64
In [31]:
# Percentage of positive comments per video, in first-appearance order of
# video_id. Vectorized with groupby: the original loop re-filtered the full
# 691k-row frame (and recomputed .unique()) on every one of 2266 iterations,
# which is O(n * m).
unique_ids = US_comments.video_id.unique()
total_counts = US_comments.groupby('video_id')['Sentiment'].count()
positive_counts = (
    US_comments[US_comments.Sentiment == 'Positive']
    .groupby('video_id')['Sentiment']
    .count()
    .reindex(total_counts.index, fill_value=0)  # videos with zero positive comments
)
positive_pct = (positive_counts / total_counts * 100).round(2)
videos = [positive_pct[vid] for vid in unique_ids]

Build a DataFrame of videos with their comment positivity (the two datasets are joined on video_id, the key shared by video posts and comments).

In [32]:
# Build a frame of positivity percentages indexed by video_id (the second
# positional argument is the index); reset_index() turns the ids into a column.
Positivity = pd.DataFrame(videos,US_comments.video_id.unique()).reset_index()
In [33]:
Positivity.columns = ['video_id','Positive Percentage']
In [34]:
Positivity.head()
Out[34]:
video_id Positive Percentage
0 XpVt6Z1Gjjo 36.12
1 cLdxuaxaQwc 38.05
2 WYYvHb03Eog 35.12
3 sjlHnJvXdQs 41.50
4 cMKX2tE5Luk 43.00

Label each video_id with its YouTube channel

In [35]:
# Map each video_id to the unique channel title(s) that posted it. Build the
# lookup once with groupby instead of re-scanning US_videos for every video
# (the original loop was O(n * m)). Missing ids yield an empty array, matching
# the original's behavior for unmatched video_ids.
channel_lookup = US_videos.groupby('video_id')['channel_title'].unique()
channels = [
    channel_lookup[vid] if vid in channel_lookup.index else np.array([], dtype=object)
    for vid in Positivity.video_id.unique()
]
In [36]:
Positivity['Channel'] = channels
In [37]:
Positivity.head()
Out[37]:
video_id Positive Percentage Channel
0 XpVt6Z1Gjjo 36.12 [Logan Paul Vlogs]
1 cLdxuaxaQwc 38.05 [PewDiePie]
2 WYYvHb03Eog 35.12 [The Verge]
3 sjlHnJvXdQs 41.50 [jacksfilms]
4 cMKX2tE5Luk 43.00 [A24]

Below are the channels whose video comments are all marked as positive sentiment (100% positive)

In [38]:
Positivity[Positivity['Positive Percentage'] == Positivity['Positive Percentage'].max()]
Out[38]:
video_id Positive Percentage Channel
132 f5F2whaRzqw 100.0 [Kerry Wong]
395 cviyIIvBlto 100.0 [Kerry Wong]
466 S-a8NVEUO4E 100.0 [E! Live from the Red Carpet]
601 wBjAmThxDpA 100.0 [Vertical Entertainment LA]
612 _zl2GV89_GM 100.0 [Cosmic Book News]
819 D6zUj1tKxiU 100.0 [Xposure 365 TV]
821 lJj66BCiZWQ 100.0 [Eric Blattberg]
972 BNRUV3TuExQ 100.0 [Fathom Events]
1317 zuKX0fPlo2Q 100.0 [Thomas Bikias]
1454 Ekc7lWx0468 100.0 [The Illusion contest]
1587 dsH83p_mfEs 100.0 [The Royal Butler]
1775 UJKl7ToDi20 100.0 [UCF Knights]
1848 KQ19fT4BQQU 100.0 [Arcus-3D]
1911 mi52IqpOp54 100.0 [Kevin Noon]
1912 94U8bow4CU4 100.0 [Shawn Setaro]
1956 W2gnef0LtBE 100.0 [Asmodee Digital]

Below are the channels whose comments were least positive (the negative-influence channels).

In [39]:
Positivity[Positivity['Positive Percentage'] == Positivity['Positive Percentage'].min()]
Out[39]:
video_id Positive Percentage Channel
151 7Ag2oDIeLvo 0.0 [OsbornTramain]
347 EwujR1ARsog 0.0 [Ford Motor Company]
386 Y7tLoqNjkks 0.0 [RAW]
420 p7sooI96zPI 0.0 [Geezus66]
424 jLa9wZHiqxg 0.0 [Rad Universe]
468 Oz88_p125uM 0.0 [ViralVideoUK]
592 BfxOimRxXvU 0.0 [Andro Hack]
604 k8LF_FU2luY 0.0 [Bollinger Motors]
611 tbDr_zAcM5g 0.0 [Maharishi University of Management]
629 HpxsQ1_UAec 0.0 [Manto kay SO Afsanay]
811 EZaPeQ2dxh8 0.0 [Malhar Takle]
820 mShBE_wHAk8 0.0 [Adam Sifounakis]
824 HFXwHcFyU_M 0.0 [NBC Sports]
1069 j8i18Wiaq7Q 0.0 [Al Jazeera Arabic قناة الجزيرة]
1253 jzr49a04olU 0.0 [Luke Skaff]
1254 QWtGGoHT4H8 0.0 [CNBC]
1464 -3AGlBYyLjo 0.0 [CrazyLaughAction]
1467 2bOeaS1wREE 0.0 [Washington Post]
1539 HjfN2Phsfng 0.0 [weatherguru76]
1586 c5RsAXsZvI8 0.0 [Clicker Learning Institute for Cats and Kittens]
2063 k5qKGNeRb68 0.0 [Triz Cru TV]
2235 CEdAjI801Wo 0.0 [BUILD Series]
2237 eLhPdGZaotQ 0.0 [WSUCougarAthletics]

Saving files for further analysis and lookup database

In [40]:
# Rows with the highest positive-comment percentage. The boolean mask already
# returns a DataFrame, so the original pd.DataFrame(...) wrapper was redundant;
# .copy() decouples the result from Positivity.
max_positivity = Positivity[Positivity['Positive Percentage'] == Positivity['Positive Percentage'].max()].copy()
In [41]:
# Rows with the lowest positive-comment percentage. As with max_positivity,
# the pd.DataFrame(...) wrapper was redundant; .copy() decouples the slice.
min_positivity = Positivity[Positivity['Positive Percentage'] == Positivity['Positive Percentage'].min()].copy()
In [42]:
min_positivity.to_csv("Worst_Channel_influence.csv") #these are the channels with the worst comments on their videos
In [43]:
max_positivity.to_csv("Best_Channel_influence.csv") #these are the channels with the best comments on their videos
In [44]:
# The original filter `col == col` compared the column to itself — a confusing
# no-op (it could only drop NaN rows, and the rounded percentages contain
# none). The intent is simply "all rows", so take an explicit copy.
all_channel_sentiment = Positivity.copy()
In [45]:
all_channel_sentiment.to_csv("All_Channel_Sentiment.csv")

Let's explore our sentiment values with some data analysis!

In [46]:
sns.set_palette('husl')
In [47]:
%pylab inline
Populating the interactive namespace from numpy and matplotlib
In [48]:
import math
In [49]:
# NOTE(review): this reads 'All_Channel_Sentiment_OG.csv', but the notebook
# saved 'All_Channel_Sentiment.csv' above — and the _OG file has different
# columns (ID, Positive_Percentage as int) — so it appears to be a manually
# edited copy. Confirm its provenance, or the notebook will not Run-All
# reproducibly.
data = pd.read_csv('All_Channel_Sentiment_OG.csv')
In [50]:
data.head()
Out[50]:
ID Positive_Percentage Channel Unnamed: 3
0 XpVt6Z1Gjjo 36 Logan Paul Vlogs NaN
1 cLdxuaxaQwc 38 PewDiePie NaN
2 WYYvHb03Eog 35 The Verge NaN
3 sjlHnJvXdQs 42 jacksfilms NaN
4 cMKX2tE5Luk 43 A24 NaN
In [51]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2266 entries, 0 to 2265
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2266 non-null   object 
 1   Positive_Percentage  2266 non-null   int64  
 2   Channel              2266 non-null   object 
 3   Unnamed: 3           0 non-null      float64
dtypes: float64(1), int64(1), object(2)
memory usage: 70.9+ KB
In [52]:
data.describe()
Out[52]:
Positive_Percentage Unnamed: 3
count 2266.000000 0.0
mean 45.812445 NaN
std 15.368947 NaN
min 0.000000 NaN
25% 36.000000 NaN
50% 44.000000 NaN
75% 55.000000 NaN
max 100.000000 NaN
In [53]:
data["Positive_Percentage"].describe()
Out[53]:
count    2266.000000
mean       45.812445
std        15.368947
min         0.000000
25%        36.000000
50%        44.000000
75%        55.000000
max       100.000000
Name: Positive_Percentage, dtype: float64
In [54]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%pylab inline
Populating the interactive namespace from numpy and matplotlib

Distribution of positive percentages across the board

In [55]:
# Distribution of per-video positive-comment percentages.
# sns.distplot was deprecated in seaborn 0.11 and removed in 0.14; histplot
# with kde=True and stat="density" reproduces its default output. The label
# also said "Toxicity Distribution", which contradicts the plotted data
# (positive-comment share) — corrected.
sns.set_palette('husl')
fig, ax = plt.subplots(figsize=(22.5, 10))
sns.histplot(data["Positive_Percentage"], kde=True, stat="density",
             color="skyblue", label="Positive-comment share", ax=ax)
ax.set(title="Distribution of positive-comment percentage per video",
       xlabel="Positive comments (%)")
ax.legend();
Out[55]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe194d4bb10>