import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import stats

%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns',None)

Lets load in our two datasets. One data set is the top trending videos in the united states and the other data set is the comments under the trending videos.¶

US_comments = pd.read_csv('UScomments.csv', error_bad_lines = False)

b'Skipping line 41589: expected 4 fields, saw 11\nSkipping line 51628: expected 4 fields, saw 7\nSkipping line 114465: expected 4 fields, saw 5\n'
b'Skipping line 142496: expected 4 fields, saw 8\nSkipping line 189732: expected 4 fields, saw 6\nSkipping line 245218: expected 4 fields, saw 7\n'
b'Skipping line 388430: expected 4 fields, saw 5\n'

US_comments.head()

US_videos = pd.read_csv('USvideos.csv', error_bad_lines = False)

US_videos.head()

Data exploration and wrangle¶

US_videos.shape

(7998, 21)

US_videos.nunique()

video_id          2335
title             2398
channel_title     1230
category_id         16
tags              2204
views             7944
likes             6627
dislikes          2532
comment_total     4153
thumbnail_link    2364
date                43
Unnamed: 11          3
Unnamed: 12          2
Unnamed: 13          2
Unnamed: 14          2
Unnamed: 15          2
Unnamed: 16          2
Unnamed: 17          2
Unnamed: 18          2
Unnamed: 19          2
Unnamed: 20          2
dtype: int64

US_videos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7998 entries, 0 to 7997
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   video_id        7998 non-null   object 
 1   title           7998 non-null   object 
 2   channel_title   7998 non-null   object 
 3   category_id     7998 non-null   int64  
 4   tags            7998 non-null   object 
 5   views           7998 non-null   int64  
 6   likes           7998 non-null   int64  
 7   dislikes        7998 non-null   int64  
 8   comment_total   7998 non-null   int64  
 9   thumbnail_link  7998 non-null   object 
 10  date            7998 non-null   object 
 11  Unnamed: 11     6 non-null      object 
 12  Unnamed: 12     2 non-null      object 
 13  Unnamed: 13     2 non-null      float64
 14  Unnamed: 14     2 non-null      object 
 15  Unnamed: 15     2 non-null      float64
 16  Unnamed: 16     2 non-null      float64
 17  Unnamed: 17     2 non-null      float64
 18  Unnamed: 18     2 non-null      float64
 19  Unnamed: 19     2 non-null      object 
 20  Unnamed: 20     2 non-null      float64
dtypes: float64(6), int64(5), object(10)
memory usage: 1.3+ MB

US_comments.shape

(691400, 4)

US_comments.nunique()

video_id          2266
comment_text    434076
likes             1284
replies            479
dtype: int64

US_comments.isnull().sum()

video_id         0
comment_text    25
likes            0
replies          0
dtype: int64

US_comments.dropna(inplace=True)

US_comments.isnull().sum()

video_id        0
comment_text    0
likes           0
replies         0
dtype: int64

US_comments.shape

(691375, 4)

US_comments.nunique()

video_id          2266
comment_text    434076
likes             1284
replies            479
dtype: int64

US_comments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 691375 entries, 0 to 691399
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   video_id      691375 non-null  object
 1   comment_text  691375 non-null  object
 2   likes         691375 non-null  object
 3   replies       691375 non-null  object
dtypes: object(4)
memory usage: 26.4+ MB

US_comments.drop(41587, inplace=True)

US_comments.likes = US_comments.likes.astype(int)
US_comments.replies = US_comments.replies.astype(int)

US_comments.head()

Time for sentiment analysis on US_comments data¶

import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/valazeinali/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!

True

from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

US_comments['Sentiment Scores'] = US_comments['comment_text'].apply(lambda x:sia.polarity_scores(x)['compound'])

US_comments.head()

Classify the sentiment scores as either positive,¶

negative, or neutral¶

US_comments['Sentiment'] = US_comments['Sentiment Scores'].apply(lambda s : 'Positive' if s > 0 else ('Neutral' if s == 0 else 'Negative'))

US_comments.head()

US_comments.Sentiment.value_counts()

Positive    313651
Neutral     235961
Negative    141762
Name: Sentiment, dtype: int64

US_comments.video_id.nunique()

2266

Number of comments that are positive and creating % of positive comments on videos¶

US_comments[US_comments.video_id == US_comments.video_id.unique()[0]]['Sentiment'].value_counts()

Neutral     299
Positive    289
Negative    212
Name: Sentiment, dtype: int64

videos = []
for i in range(0,US_comments.video_id.nunique()):
    a = US_comments[(US_comments.video_id == US_comments.video_id.unique()[i]) & (US_comments.Sentiment == 'Positive')].count()[0]
    b = US_comments[US_comments.video_id == US_comments.video_id.unique()[i]]['Sentiment'].value_counts().sum()
    Percentage = (a/b)*100
    videos.append(round(Percentage,2))

Dataframe of the videos with their comments (Use the two data sets to match primary keys of video post/video comments.¶

Positivity = pd.DataFrame(videos,US_comments.video_id.unique()).reset_index()

Positivity.columns = ['video_id','Positive Percentage']

Positivity.head()

Name the video_id's with their Youtube channels¶

channels = []
for i in range(0,Positivity.video_id.nunique()):
    channels.append(US_videos[US_videos.video_id == Positivity.video_id.unique()[i]]['channel_title'].unique())

Positivity['Channel'] = channels

Positivity.head()

Below shows channels whose comments are marked all positive sentiment¶

Positivity[Positivity['Positive Percentage'] == Positivity['Positive Percentage'].max()]

Below shows channels who comments were negative (The negative influence channels).¶

Positivity[Positivity['Positive Percentage'] == Positivity['Positive Percentage'].min()]

Saving files for further analysis and lookup database¶

max_positivity = pd.DataFrame(Positivity[Positivity['Positive Percentage'] == Positivity['Positive Percentage'].max()])

min_positivity = pd.DataFrame(Positivity[Positivity['Positive Percentage'] == Positivity['Positive Percentage'].min()])

min_positivity.to_csv("Worst_Channel_influence.csv") #these are the channels with the worst comments on their videos

max_positivity.to_csv("Best_Channel_influence.csv") #these are the channels with the best comments on their videos

all_channel_sentiment = pd.DataFrame(Positivity[Positivity['Positive Percentage'] == Positivity['Positive Percentage']])

all_channel_sentiment.to_csv("All_Channel_Sentiment.csv")

Lets explore our sentiment values with some data analysis!¶

sns.set_palette('husl')

%pylab inline

Populating the interactive namespace from numpy and matplotlib

import math

data = pd.read_csv('All_Channel_Sentiment_OG.csv')

data.head()

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2266 entries, 0 to 2265
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2266 non-null   object 
 1   Positive_Percentage  2266 non-null   int64  
 2   Channel              2266 non-null   object 
 3   Unnamed: 3           0 non-null      float64
dtypes: float64(1), int64(1), object(2)
memory usage: 70.9+ KB

data.describe()

data["Positive_Percentage"].describe()

count    2266.000000
mean       45.812445
std        15.368947
min         0.000000
25%        36.000000
50%        44.000000
75%        55.000000
max       100.000000
Name: Positive_Percentage, dtype: float64

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib

Distribution of positive percentages across the board¶

sns.set_palette('husl')
plt.figure(figsize=(22.5,10))
sns.distplot(data["Positive_Percentage"], color="skyblue", label="Toxicity Distribution")

<matplotlib.axes._subplots.AxesSubplot at 0x7fe194d4bb10>

	video_id	comment_text	likes
0	XpVt6Z1Gjjo	Logan Paul it's yo big day ‼️‼️‼️	4
1	XpVt6Z1Gjjo	I've been following you from the start of your...	3
2	XpVt6Z1Gjjo	Say hi to Kong and maverick for me	3
3	XpVt6Z1Gjjo	MY FAN . attendance	3
4	XpVt6Z1Gjjo	trending 😉	3

	video_id	title	channel_title	category_id	tags	views	likes	dislikes	comment_total	thumbnail_link	date	Unnamed: 11	Unnamed: 12	Unnamed: 13	Unnamed: 14	Unnamed: 15	Unnamed: 16	Unnamed: 17	Unnamed: 18	Unnamed: 19	Unnamed: 20
0	XpVt6Z1Gjjo	1 YEAR OF VLOGGING -- HOW LOGAN PAUL CHANGED Y...	Logan Paul Vlogs	24	logan paul vlog\|logan paul\|logan\|paul\|olympics...	4394029	320053	5931	46245	https://i.ytimg.com/vi/XpVt6Z1Gjjo/default.jpg	13.09	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	K4wEI5zhHB0	iPhone X — Introducing iPhone X — Apple	Apple	28	Apple\|iPhone 10\|iPhone Ten\|iPhone\|Portrait Lig...	7860119	185853	26679	0	https://i.ytimg.com/vi/K4wEI5zhHB0/default.jpg	13.09	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	cLdxuaxaQwc	My Response	PewDiePie	22	[none]	5845909	576597	39774	170708	https://i.ytimg.com/vi/cLdxuaxaQwc/default.jpg	13.09	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	WYYvHb03Eog	Apple iPhone X first look	The Verge	28	apple iphone x hands on\|Apple iPhone X\|iPhone ...	2642103	24975	4542	12829	https://i.ytimg.com/vi/WYYvHb03Eog/default.jpg	13.09	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	sjlHnJvXdQs	iPhone X (parody)	jacksfilms	23	jacksfilms\|parody\|parodies\|iphone\|iphone x\|iph...	1168130	96666	568	6666	https://i.ytimg.com/vi/sjlHnJvXdQs/default.jpg	13.09	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	video_id	comment_text	likes
0	XpVt6Z1Gjjo	Logan Paul it's yo big day ‼️‼️‼️	4
1	XpVt6Z1Gjjo	I've been following you from the start of your...	3
2	XpVt6Z1Gjjo	Say hi to Kong and maverick for me	3
3	XpVt6Z1Gjjo	MY FAN . attendance	3
4	XpVt6Z1Gjjo	trending 😉	3

	video_id	comment_text	likes	Sentiment Scores
0	XpVt6Z1Gjjo	Logan Paul it's yo big day ‼️‼️‼️	4	0.0000
1	XpVt6Z1Gjjo	I've been following you from the start of your...	3	0.0000
2	XpVt6Z1Gjjo	Say hi to Kong and maverick for me	3	0.0000
3	XpVt6Z1Gjjo	MY FAN . attendance	3	0.4648
4	XpVt6Z1Gjjo	trending 😉	3	0.0000

	video_id	Positive Percentage
0	XpVt6Z1Gjjo	36.12
1	cLdxuaxaQwc	38.05
2	WYYvHb03Eog	35.12
3	sjlHnJvXdQs	41.50
4	cMKX2tE5Luk	43.00

	video_id	Positive Percentage	Channel
0	XpVt6Z1Gjjo	36.12	[Logan Paul Vlogs]
1	cLdxuaxaQwc	38.05	[PewDiePie]
2	WYYvHb03Eog	35.12	[The Verge]
3	sjlHnJvXdQs	41.50	[jacksfilms]
4	cMKX2tE5Luk	43.00	[A24]

	video_id	Positive Percentage	Channel
132	f5F2whaRzqw	100.0	[Kerry Wong]
395	cviyIIvBlto	100.0	[Kerry Wong]
466	S-a8NVEUO4E	100.0	[E! Live from the Red Carpet]
601	wBjAmThxDpA	100.0	[Vertical Entertainment LA]
612	_zl2GV89_GM	100.0	[Cosmic Book News]
819	D6zUj1tKxiU	100.0	[Xposure 365 TV]
821	lJj66BCiZWQ	100.0	[Eric Blattberg]
972	BNRUV3TuExQ	100.0	[Fathom Events]
1317	zuKX0fPlo2Q	100.0	[Thomas Bikias]
1454	Ekc7lWx0468	100.0	[The Illusion contest]
1587	dsH83p_mfEs	100.0	[The Royal Butler]
1775	UJKl7ToDi20	100.0	[UCF Knights]
1848	KQ19fT4BQQU	100.0	[Arcus-3D]
1911	mi52IqpOp54	100.0	[Kevin Noon]
1912	94U8bow4CU4	100.0	[Shawn Setaro]
1956	W2gnef0LtBE	100.0	[Asmodee Digital]

	video_id	Channel
151	7Ag2oDIeLvo	[OsbornTramain]
347	EwujR1ARsog	[Ford Motor Company]
386	Y7tLoqNjkks	[RAW]
420	p7sooI96zPI	[Geezus66]
424	jLa9wZHiqxg	[Rad Universe]
468	Oz88_p125uM	[ViralVideoUK]
592	BfxOimRxXvU	[Andro Hack]
604	k8LF_FU2luY	[Bollinger Motors]
611	tbDr_zAcM5g	[Maharishi University of Management]
629	HpxsQ1_UAec	[Manto kay SO Afsanay]
811	EZaPeQ2dxh8	[Malhar Takle]
820	mShBE_wHAk8	[Adam Sifounakis]
824	HFXwHcFyU_M	[NBC Sports]
1069	j8i18Wiaq7Q	[Al Jazeera Arabic قناة الجزيرة]
1253	jzr49a04olU	[Luke Skaff]
1254	QWtGGoHT4H8	[CNBC]
1464	-3AGlBYyLjo	[CrazyLaughAction]
1467	2bOeaS1wREE	[Washington Post]
1539	HjfN2Phsfng	[weatherguru76]
1586	c5RsAXsZvI8	[Clicker Learning Institute for Cats and Kittens]
2063	k5qKGNeRb68	[Triz Cru TV]
2235	CEdAjI801Wo	[BUILD Series]
2237	eLhPdGZaotQ	[WSUCougarAthletics]

	Positive_Percentage	Unnamed: 3
count	2266.000000	0.0
mean	45.812445	NaN
std	15.368947	NaN
min	0.000000	NaN
25%	36.000000	NaN
50%	44.000000	NaN
75%	55.000000	NaN
max	100.000000	NaN