DTSA 5580 Network Analysis Final Project
Important Note: Please click on the specific html files to see the weights (hover-able) and explore node names/words more in-depth.
Load Packages
import gzip
import json
import nltk
import glob
import os
import shutil
import json
import csv
import networkx as nx
import matplotlib.pyplot as plt
try:
import pyvis
from pyvis.network import Network
except:
!pip install pyvis
import pyvis
from pyvis import Network
from time import sleep
import nltk
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()
import re
import shutil
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
import string
import itertools
punctuation = string.punctuation
stopwordsset = set(stopwords.words("english"))
stopwordsset.add('rt')
stopwordsset.add("'s")
from datetime import datetime
import pandas as pd
from IPython.core.display import display, HTML
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data] /root/nltk_data...
[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.
True
Load Data
# Mount Google Drive so the tweet archive can be read and the pyvis HTML
# graphs written below persist across Colab sessions.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
Define Extract Functions
def extract_mention(tweet, mention):
    """Return 1 if `tweet` @-mentions `mention`, else 0.

    A match is a case-insensitive equality against either the mention's
    screen name or its display name.

    Fixes over the original: it implicitly returned None (not 0) when the
    tweet carried no 'user_mentions' entity, and its `else: return 0` could
    bail out before every mention in the list had been checked. This version
    always returns 0/1 and scans the full mention list.

    Parameters
    ----------
    tweet : dict
        Parsed tweet JSON; must have an 'entities' key.
    mention : str
        Brand/user name to look for.
    """
    target = mention.upper()
    for entry in tweet['entities'].get('user_mentions', []):
        if entry['screen_name'].upper() == target or entry['name'].upper() == target:
            return 1
    return 0
def filter_tweets(tweets, mention):
    """Yield (screen_name, mention, tweet_dict) for each tweet that @-mentions `mention`.

    `tweets` may contain raw JSONL lines (bytes/str) or already-parsed dicts;
    raw lines are decoded on the fly.
    """
    for raw in tweets:
        parsed = json.loads(raw) if isinstance(raw, (bytes, str)) else raw
        if extract_mention(parsed, mention):
            yield parsed['user']['screen_name'], mention, parsed
# Taken from Week 4 Lecture Notebook
def removeURL(text):
    """Strip URLs plus a few typographic characters from `text`.

    The curly quotes, em dash, and ellipsis below are not in
    string.punctuation, so the punctuation filter would miss them.
    """
    without_links = re.sub(r"http\S+", "", text)
    return re.sub(r"[’“”—…]", "", without_links)
#removes useless words such as a, an, the
def stopWords(tokenizedtext):
    """Return the tokens from `tokenizedtext` that are not in the
    module-level stop-word set (English stop words plus 'rt' and "'s")."""
    return [token for token in tokenizedtext if token not in stopwordsset]
# feature reduction. taking words and getting their roots and graphing only the root words
def lemmatizer(tokenizedtext):
    """Map every token to its WordNet lemma using the module-level lemmatizer `wn`."""
    return [wn.lemmatize(token) for token in tokenizedtext]
#inputs a list of tokens and returns a list of unpunctuated tokens/words
def removePunctuation(tokenizedtext):
    """Drop punctuation tokens, then strip punctuation characters from the rest.

    Two passes, as in the original: first remove tokens that are themselves
    punctuation (substring test against the punctuation string), then delete
    any remaining punctuation characters inside the surviving tokens.
    NOTE: a token made only of repeated punctuation (e.g. '...') survives the
    first pass and becomes '' after stripping — empty strings are kept.
    """
    # Translation table built once instead of per token.
    strip_table = str.maketrans('', '', string.punctuation)
    survivors = [token for token in tokenizedtext if token not in punctuation]
    return [token.translate(strip_table) for token in survivors]
def removesinglewords(tokenizedtext):
    """Keep only tokens longer than one character (drops stray letters and '')."""
    return [feature for feature in tokenizedtext if len(feature) > 1]
# Adapted from Week 4 Lab
def token_counts(tweets, tagger=nltk.tag.PerceptronTagger().tag, tokenizer=nltk.TweetTokenizer().tokenize, parts_of_speech=None):
    """Count cleaned-token frequencies across `tweets`, optionally POS-filtered.

    Each tweet is URL-stripped, tokenized, stop-word-filtered, lemmatized,
    and de-punctuated before tagging.

    Parameters
    ----------
    tweets : iterable
        Raw JSONL lines (bytes/str) or parsed tweet dicts.
    tagger, tokenizer : callables
        NLTK tagger/tokenizer; defaults are built once at definition time.
    parts_of_speech : list[str] | None
        Penn Treebank tags to keep; None/empty means count every token.

    Returns
    -------
    dict mapping token -> occurrence count.

    Fixes over the original: `is None` instead of `== None`, dict.get for the
    counting idiom, and the POS loop no longer runs (as a guaranteed no-op)
    when no POS filter was requested.
    """
    if parts_of_speech is None:
        parts_of_speech = []
    token_dict = {}
    for tweet in tweets:
        if isinstance(tweet, (bytes, str)):
            tweet = json.loads(tweet)
        # Extended tweets store the body under 'full_text'.
        if 'full_text' in tweet.keys():
            tweet_text = tweet['full_text']
        else:
            tweet_text = tweet['text']
        tweet_text = removeURL(tweet_text)
        token = tokenizer(tweet_text)
        token = stopWords(token)
        token = lemmatizer(token)
        token = removePunctuation(token)
        tags = tagger(token)
        if len(tags) == 0:
            continue
        if len(parts_of_speech) == 0:
            # No POS filter: count every token.
            for word, _tag in tags:
                token_dict[word] = token_dict.get(word, 0) + 1
        else:
            for item in tags:
                if item is None:
                    continue
                word, tag = item
                if tag in parts_of_speech:
                    token_dict[word] = token_dict.get(word, 0) + 1
    return token_dict
def tweet_sentiment(tweet, tokenizer=nltk.TweetTokenizer().tokenize):
    """Return the VADER compound sentiment score (-1.0 .. 1.0) for one tweet dict.

    The tweet text goes through the same cleaning pipeline as token_counts
    (URL removal, tokenization, stop words, lemmatization, punctuation)
    before being rejoined and scored.

    Performance fix: the original constructed a fresh
    SentimentIntensityAnalyzer — reloading the VADER lexicon — on every call,
    which dominated runtime when applied over ~160k tweets. The analyzer is
    now built once and cached on the function object.
    """
    if 'full_text' in tweet.keys():
        tweet_text = tweet['full_text']
    else:
        tweet_text = tweet['text']
    tweet_text = removeURL(tweet_text)
    token = tokenizer(tweet_text)
    token = stopWords(token)
    token = lemmatizer(token)
    token = removePunctuation(token)
    sentence = ' '.join(token)
    if not hasattr(tweet_sentiment, '_sia'):
        tweet_sentiment._sia = SentimentIntensityAnalyzer()
    scores = tweet_sentiment._sia.polarity_scores(sentence)
    return scores['compound']
Extract Tweets with an @ Reference to Companies
# Read the compressed JSONL archive ONCE and reuse the raw lines for all three
# brand filters — the original reopened and re-decompressed the same file
# three times. filter_tweets decodes each line and keeps only tweets that
# @-mention the given brand.
with gzip.open('drive/MyDrive/nikelululemonadidas_tweets.jsonl.gz') as f:
    raw_tweet_lines = f.readlines()
lululemon = list(filter_tweets(raw_tweet_lines, 'lululemon'))
nike = list(filter_tweets(raw_tweet_lines, 'nike'))
adidas = list(filter_tweets(raw_tweet_lines, 'adidas'))
print('Lululemon Tweet Mentions:', len(lululemon))
print('Nike Tweet Mentions:', len(nike))
print('Adidas Tweet Mentions:', len(adidas))
Lululemon Tweet Mentions: 6168
Nike Tweet Mentions: 118953
Adidas Tweet Mentions: 36485
Central Users
We can start by investigating the key users and their tweets. We will first create a subset of the top 100 users per segment.
# One row per (user, brand, raw tweet dict) tuple.
nike_df = pd.DataFrame(nike, columns=['user','segment', 'tweet'])
adidas_df = pd.DataFrame(adidas, columns=['user','segment', 'tweet'])
lululemon_df = pd.DataFrame(lululemon, columns=['user','segment', 'tweet'])
# Pull each author's profile description out of the raw tweet object.
nike_df['user_description'] = nike_df['tweet'].apply(lambda x: x['user']['description'])
# BUG FIX: this line previously read from lululemon_df, so adidas rows got
# lululemon users' descriptions (and NaN beyond lululemon's shorter index) —
# visible as spurious 'N/A' entries in the adidas top-20 table below.
adidas_df['user_description'] = adidas_df['tweet'].apply(lambda x: x['user']['description'])
lululemon_df['user_description'] = lululemon_df['tweet'].apply(lambda x: x['user']['description'])
nike_df['user_description'] = nike_df['user_description'].fillna('N/A')
adidas_df['user_description'] = adidas_df['user_description'].fillna('N/A')
lululemon_df['user_description'] = lululemon_df['user_description'].fillna('N/A')
# Top 20 nike @-mentioners, joined with one (max) description per user.
nike_df['user'].value_counts(ascending=False).rename_axis('user').reset_index(name='counts').head(20).merge(nike_df[['user','user_description']].groupby('user',as_index=False).max(), how='left', on='user')
| user | counts | user_description | |
|---|---|---|---|
| 0 | SneakerScouts | 6891 | The #1 source for sneaker news, release dates,... |
| 1 | HUsBadGuys | 4067 | HU's Bad Guys #HBOW |
| 2 | Kaya_Alexander5 | 720 | Just a girl who loves her sneakers. The sneake... |
| 3 | Stealth783 | 590 | |
| 4 | GirardisGod | 530 | My 2 Instagram accounts are (@girardisgodgram)... |
| 5 | ShockandAweEnt | 361 | Providing a broad range of entertainment aroun... |
| 6 | vadriano2000 | 334 | |
| 7 | turtlepace5 | 321 | CT born, WI raised. Packers, Auntie, Coffee, S... |
| 8 | SSBrandon | 278 | @Nike Apostle & #SNKRS VET, who pledged allegi... |
| 9 | zen_masstah | 271 | SNKR head, hip hop, anti influencer, hater of ... |
| 10 | jadendaly | 247 | Please allow me to introduce myself: I’m a man... |
| 11 | DJBLUIZ | 212 | Dj/Sneakerhead 👟10.5-11 - Cowboys-Knicks-Devil... |
| 12 | levibrian86 | 207 | chef, private sec., vocal singing,let u know ... |
| 13 | therealJCW | 205 | #SneakerScouts @SneakerScouts @ShockandAweEnt |
| 14 | efiorentino31 | 203 | ☀︎︎ ♍︎ ☽ ♓︎ ❥ sneakers & makeup ♥︎ |
| 15 | Moonman989 | 177 | #AJ1FAM |
| 16 | joshuajhan | 171 | 🇰🇷 916 English Bulldogs Nike Jumpman sneakerhe... |
| 17 | beiberlove69 | 157 | it's a big hat, it's funny\nsz13 crew |
| 18 | BoysRevelacion | 137 | |
| 19 | DoBetterBB | 136 |
# Top 20 adidas @-mentioners, joined with one (max) profile description per user.
adidas_df['user'].value_counts(ascending=False).rename_axis('user').reset_index(name='counts').head(20).merge(adidas_df[['user','user_description']].groupby('user',as_index=False).max(), how='left', on='user')
| user | counts | user_description | |
|---|---|---|---|
| 0 | BoysRevelacion | 137 | test pilot for pies. no thoughts, no vibes. #b... |
| 1 | zen_masstah | 123 | N/A |
| 2 | bleustar9757 | 108 | N/A |
| 3 | jajuanmharley | 89 | 🏳️🌈Health educator and certified fitness trai... |
| 4 | turtlepace5 | 68 | N/A |
| 5 | TheRealMGesta | 68 | 🌎 #VoteBlue2022 #NeverGOP #WeVote,WeWin\nNo u... |
| 6 | restebanrf1993 | 67 | Transport Writer and Editor En/Fr |
| 7 | KVSwitzer | 53 | likes = 🙄 / 🥰/ 🤣 / 🤬 / 🥴 |
| 8 | GrossAmilee | 39 | N/A |
| 9 | natmmom | 35 | 🇱🇧West Coast Phalange Supporter🇱🇧 Retvrn to Af... |
| 10 | wearekrimy | 35 | So Cal born and raised. Athletic Trainer, Phys... |
| 11 | Moonman989 | 34 | N/A |
| 12 | erik102079 | 34 | N/A |
| 13 | FootwearNews | 33 | photo:@linksgems |
| 14 | golacokits | 31 | N/A |
| 15 | kaflickinger74 | 30 | seo | snack enthusiast | photography | grandma... |
| 16 | josiethewonder | 27 | Photography📸 Cane Creek Distillery🥃 |
| 17 | DeionPatterson1 | 27 | N/A |
| 18 | BPrince95 | 26 | welcome to the clown show |
| 19 | JamieGeorge93 | 25 | N/A |
# Top 20 lululemon @-mentioners, joined with one (max) profile description per user.
lululemon_df['user'].value_counts(ascending=False).rename_axis('user').reset_index(name='counts').head(20).merge(lululemon_df[['user','user_description']].groupby('user',as_index=False).max(), how='left', on='user')
| user | counts | user_description | |
|---|---|---|---|
| 0 | JimAceman | 32 | Mammal, Partner, Father, Son, Brother, Friend,... |
| 1 | MattMully | 31 | Be...excellent to eachother. |
| 2 | kinseyfit | 24 | “Fit & Fearless” - where we believe it’s not j... |
| 3 | WhatAndrewSaid | 23 | “so great looking and smart, a true Stable Gen... |
| 4 | MasashiKohmura | 20 | Cross-county skiing information specialized in... |
| 5 | liab9845 | 20 | |
| 6 | blythelia3505 | 16 | $jillnauyokas |
| 7 | Chrisblythe9845 | 16 | Food & Drink Funny Fashion Television Music Di... |
| 8 | DeezeFi | 14 | all I know is I don't know nothing | lindy wal... |
| 9 | gomerland2 | 14 | Survivor and researcher as I embark in a journ... |
| 10 | MrLeonardKim | 14 | I feel the pain in your heart and I know what ... |
| 11 | 365yogadream | 13 | Ambassador @lululemon -Nutrition -Emergency Pr... |
| 12 | C_kelly1988 | 13 | ⚾️ 🌳 🥾 👟 size 11-12. Cincy sports fan. #teamcr... |
| 13 | MattTooze | 13 | Experienced ex athlete 800m 1.57.3. sub 16 5k ... |
| 14 | AFineBlogger | 13 | VP East 86th St Assoc, created https://t.co/Uz... |
| 15 | cloudwhiteNFT | 12 | financial philosopher. @axieinfinity evangelis... |
| 16 | lulunotify | 11 | The first monitor service for #lululemon drops... |
| 17 | TheSportsIndex | 11 | “THE bellwether stock market index for sports.... |
| 18 | sean_broedow | 11 | I’m a runner, a 2021 Legacy Nuun Ambassador, 2... |
| 19 | aleman305 | 10 | BJJ, MMA, CrossFit, and Olympic Weightlifting |
Just looking at the top users for each company, we can see that users who at (@) tweet Nike are often sneakerheads, or people who either collect or buy lots of sneakers. Both Adidas and Lululemon’s users seem to be a more random assortment of people. Lululemon does, however, seem to be at (@) tweeted by a lot of fitness and health inclined people.
# Screen names of the 100 most prolific @-mentioners per brand
# (value_counts sorts descending by default; the index holds the names).
top_nike = nike_df['user'].value_counts().head(100).index.values
top_adidas = adidas_df['user'].value_counts().head(100).index.values
top_lululemon = lululemon_df['user'].value_counts().head(100).index.values
# Directed mention graph: top-100 users per brand -> brand node.
nt = Network('1200', '1600', directed=True, notebook=True, cdn_resources='remote')
Graph = nx.DiGraph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
tweet_groups = [lululemon, adidas, nike]
tweet_users = [top_lululemon, top_adidas, top_nike]
for group, users in zip(tweet_groups, tweet_users):
    # Set gives O(1) membership tests; the original scanned a numpy array
    # of 100 names for every one of the ~160k tweets.
    user_set = set(users)
    for user, brand, _tweet in group:
        if user in user_set:
            if Graph.has_edge(user, brand):
                # 'title' tracks the raw mention count (hover tooltip);
                # 'weight' is a compressed edge width for display.
                w = Graph.edges[user, brand]['title']
                Graph.edges[user, brand]['weight'] = ((w + 1) // 100) + 1
                Graph.edges[user, brand]['title'] = w + 1
            else:
                Graph.add_edge(user, brand, weight=1, title=1)
nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/user_mention_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 285
Edges: 300
Analysis
We can see from the pyvis network graph that of the top 100 users for each company, the red cluster (Nike) and yellow cluster (Adidas) had more users in common than the blue cluster (Lululemon). This makes sense as both Nike and Adidas are primarily known for their shoes, while Lululemon is known for their clothing. Similarly, even when comparing clothing, which Nike and Adidas also manufacture, they primarily focus on workout clothing, while Lululemon is famous for its yoga attire.
Similarly, when looking at the number and size of node linkages, Nike has lots of users who continually at (@) tweet them. One user, SneakerScouts, tweeted almost 7,000 times within the dataset. Comparatively, Adidas and Lululemon did not have any followers who tweeted at (@) them that much.
There do seem to be a handful of users who tweet at both Nike and Adidas.
We can now investigate the key words these top users are using in their quote tweets.
# Restrict each brand's tweets to its top-100 users.
lululemon_df_users = lululemon_df[lululemon_df['user'].isin(top_lululemon)]
adidas_df_users = adidas_df[adidas_df['user'].isin(top_adidas)]
nike_df_users = nike_df[nike_df['user'].isin(top_nike)]
# Raw tweet dicts for those users only.
lululemon_user_tweets = lululemon_df_users['tweet'].values
adidas_user_tweets = adidas_df_users['tweet'].values
nike_user_tweets = nike_df_users['tweet'].values
# Word-frequency tables per brand, keeping only content words
# (Penn Treebank adjective/noun/verb tags).
lululemon_vocab_df = pd.DataFrame(list(token_counts(list(lululemon_user_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count']) #Adjective, Nouns, and Verbs Only
adidas_vocab_df = pd.DataFrame(list(token_counts(list(adidas_user_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count'])
nike_vocab_df = pd.DataFrame(list(token_counts(list(nike_user_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count'])
# Semantic network: each brand node linked to its 150 most frequent words.
nt = Network('1200', '1600', directed=False, notebook=True, cdn_resources='remote')
Graph = nx.Graph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
vocab_df = [lululemon_vocab_df, adidas_vocab_df, nike_vocab_df]
tweet_groups = ['lululemon', 'adidas', 'nike']
for vocab, brand in zip(vocab_df, tweet_groups):
    top_words = vocab.sort_values('count', ascending=False).head(150)
    for word, count in zip(top_words['word'], top_words['count']):
        freq = int(count)
        # 'title' carries the raw count (hover tooltip); 'weight' scales width.
        Graph.add_edge(word.lower(), brand, weight=freq // 100, title=freq)
nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/user_words_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 386
Edges: 433
Analysis
We can see that Nike in particular is referenced much more in at (@) tweets than other companies. Interestingly enough, Nike’s at (@) tweets also had a higher frequency of ‘RT,’ or retweet, than the other companies. Nike’s at (@) tweets also had a higher frequency of ‘available,’ which could indicate that more people are asking if shoes or other Nike products are available for purchase.
Lululemon had more emojis as frequent words than other companies. Nike’s most frequent word, however, was a reference to its most frequent at (@) tweeter, which seems to be a sneaker news site or account.
Segmentation by Follower Count
Because the number of unique users from the entire subset of tweets is so large, we can split the twitter mentions graph into two sections: users with over 50,000 followers and users between 4,300 and 5,000. This was done to limit the number of nodes on a graph and provide a similar number of nodes between network graphs.
# Directed mention graph limited to authors with >= 50,000 followers.
nt = Network('1200', '1600', directed=True, notebook=True, cdn_resources='remote')
Graph = nx.DiGraph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
tweet_groups = [lululemon, adidas, nike]
for group in tweet_groups:
    for user, brand, tweet in group:
        if tweet['user']['followers_count'] < 50000:
            continue
        if Graph.has_edge(user, brand):
            # 'title' = raw mention count; 'weight' = compressed display width.
            mentions = Graph.edges[user, brand]['title']
            Graph.edges[user, brand]['weight'] = ((mentions + 1) // 2) + 1
            Graph.edges[user, brand]['title'] = mentions + 1
        else:
            Graph.add_edge(user, brand, weight=1, title=1)
nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/high_follower_mention_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 285
Edges: 300
Analysis
We can see that most of the high follower count users at (@) tweet Nike more than the other brands. Adidas is (@) tweeted less and Lululemon even less so. We can also see there is a similar distribution to the number of users who at (@) tweet multiple companies. Most users who at (@) tweet two companies will (@) tweet Nike and Adidas. There are very few users who at (@) tweet all three companies.
# Directed mention graph for authors with 4,300-5,000 followers (inclusive).
nt = Network('1200', '1600', directed=True, notebook=True, cdn_resources='remote')
Graph = nx.DiGraph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
tweet_groups = [lululemon, adidas, nike]
for group in tweet_groups:
    for user, brand, tweet in group:
        # Chained comparison replaces the original bitwise `&` of two
        # parenthesized boolean tests — same result, idiomatic Python.
        if not (4300 <= tweet['user']['followers_count'] <= 5000):
            continue
        if Graph.has_edge(user, brand):
            # 'title' = raw mention count; 'weight' = compressed display width.
            mentions = Graph.edges[user, brand]['title']
            Graph.edges[user, brand]['weight'] = ((mentions + 1) // 2) + 1
            Graph.edges[user, brand]['title'] = mentions + 1
        else:
            Graph.add_edge(user, brand, weight=1, title=1)
nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/low_follower_mention_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 805
Edges: 869
Analysis
We can see that the distribution of at (@) tweets from lower follower tweeters is very similar to the high follower tweeters.
We can now investigate the semantic network graphs created from these two segments.
# Extract each author's follower count from the raw tweet object.
lululemon_df['followers'] = lululemon_df['tweet'].apply(lambda x: x['user']['followers_count'])
adidas_df['followers'] = adidas_df['tweet'].apply(lambda x: x['user']['followers_count'])
nike_df['followers'] = nike_df['tweet'].apply(lambda x: x['user']['followers_count'])
# High-follower segment: authors with at least 50k followers.
lululemon_df_high_follower = lululemon_df[lululemon_df['followers'] >= 50000]
adidas_df_high_follower = adidas_df[adidas_df['followers'] >= 50000]
nike_df_high_follower = nike_df[nike_df['followers'] >= 50000]
lululemon_high_follower_tweets = lululemon_df_high_follower['tweet'].values
adidas_high_follower_tweets = adidas_df_high_follower['tweet'].values
nike_high_follower_tweets = nike_df_high_follower['tweet'].values
# Word-frequency tables for the segment (content-word POS tags only).
lululemon_vocab_df = pd.DataFrame(list(token_counts(list(lululemon_high_follower_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count']) #Adjective, Nouns, and Verbs Only
adidas_vocab_df = pd.DataFrame(list(token_counts(list(adidas_high_follower_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count'])
nike_vocab_df = pd.DataFrame(list(token_counts(list(nike_high_follower_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count'])
# Semantic network for the high-follower segment: brand -> top-150 words.
nt = Network('1200', '1600', directed=False, notebook=True, cdn_resources='remote')
Graph = nx.Graph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
vocab_df = [lululemon_vocab_df, adidas_vocab_df, nike_vocab_df]
tweet_groups = ['lululemon', 'adidas', 'nike']
for vocab, brand in zip(vocab_df, tweet_groups):
    top_words = vocab.sort_values('count', ascending=False).head(150)
    for word, count in zip(top_words['word'], top_words['count']):
        freq = int(count)
        # 'title' carries the raw count (hover tooltip); 'weight' scales width.
        Graph.add_edge(word.lower(), brand, weight=freq // 100, title=freq)
nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/high_follower_words_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 338
Edges: 436
Analysis
We can see that the top 150 words associated with each of the brands from tweets with a high follower count are distributed similarly to the previous semantic network graphs. Both Nike and Adidas have more word overlap than with Lululemon. There are a few words that all three companies have in common, but all words are fairly common on twitter and when discussing a company, including rt (re-tweet), store, and online.
Interestingly, a fair amount of overlap words from Nike and Adidas are other companies (xbox, starbucks, subway, etc.). Both Adidas and Lululemon have a handful of emoji’s as their most frequent words.
# Low-follower segment: authors with 4,300-5,000 followers
# (Series.between is inclusive on both ends, matching >= 4300 & <= 5000).
lululemon_df_low_follower = lululemon_df[lululemon_df['followers'].between(4300, 5000)]
adidas_df_low_follower = adidas_df[adidas_df['followers'].between(4300, 5000)]
nike_df_low_follower = nike_df[nike_df['followers'].between(4300, 5000)]
lululemon_low_follower_tweets = lululemon_df_low_follower['tweet'].values
adidas_low_follower_tweets = adidas_df_low_follower['tweet'].values
nike_low_follower_tweets = nike_df_low_follower['tweet'].values
# Adjectives, nouns, and verbs only (Penn Treebank tags).
content_pos = ['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
lululemon_vocab_df = pd.DataFrame(list(token_counts(list(lululemon_low_follower_tweets), parts_of_speech=content_pos).items()), columns=['word','count'])
adidas_vocab_df = pd.DataFrame(list(token_counts(list(adidas_low_follower_tweets), parts_of_speech=content_pos).items()), columns=['word','count'])
nike_vocab_df = pd.DataFrame(list(token_counts(list(nike_low_follower_tweets), parts_of_speech=content_pos).items()), columns=['word','count'])
# Semantic network for the low-follower segment: brand -> top-150 words.
nt = Network('1200', '1600', directed=False, notebook=True, cdn_resources='remote')
Graph = nx.Graph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
vocab_df = [lululemon_vocab_df, adidas_vocab_df, nike_vocab_df]
tweet_groups = ['lululemon', 'adidas', 'nike']
for vocab, brand in zip(vocab_df, tweet_groups):
    top_words = vocab.sort_values('count', ascending=False).head(150)
    for word, count in zip(top_words['word'], top_words['count']):
        freq = int(count)
        # 'title' carries the raw count (hover tooltip); 'weight' scales width.
        Graph.add_edge(word.lower(), brand, weight=freq // 100, title=freq)
nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/low_follower_words_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 353
Edges: 439
Analysis
We can see that the top 150 words associated with each of the brands from tweets with a low follower count are noticeably different from the previous semantic network graphs. Here, there are more overlapping words between the three companies, which could indicate that there is less brand recognition for users with lower follower counts. Visually, it looks like Lululemon has the largest number of unshared words while Nike has the least.
Sentiment Analysis
We can also attempt to classify tweets by sentiment and take a small subset of each to analyze user and word networks.
# takes a while to run
# VADER compound score per tweet: -1 (most negative) .. +1 (most positive).
lululemon_df['tweet_sentiment'] = lululemon_df['tweet'].apply(tweet_sentiment)
adidas_df['tweet_sentiment'] = adidas_df['tweet'].apply(tweet_sentiment)
nike_df['tweet_sentiment'] = nike_df['tweet'].apply(tweet_sentiment)
# Strongly positive tweets only (compound >= 0.95).
lululemon_df_high_sentiment = lululemon_df[lululemon_df['tweet_sentiment'] >= 0.95]
adidas_df_high_sentiment = adidas_df[adidas_df['tweet_sentiment'] >= 0.95]
nike_df_high_sentiment = nike_df[nike_df['tweet_sentiment'] >= 0.95]
lululemon_high_sentiment_tweets = lululemon_df_high_sentiment['tweet'].values
adidas_high_sentiment_tweets = adidas_df_high_sentiment['tweet'].values
nike_high_sentiment_tweets = nike_df_high_sentiment['tweet'].values
# Word-frequency tables for the positive segment (content-word POS tags only).
lululemon_vocab_df = pd.DataFrame(list(token_counts(list(lululemon_high_sentiment_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count']) #Adjective, Nouns, and Verbs Only
adidas_vocab_df = pd.DataFrame(list(token_counts(list(adidas_high_sentiment_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count'])
nike_vocab_df = pd.DataFrame(list(token_counts(list(nike_high_sentiment_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count'])
# Directed mention graph built from the strongly positive tweets.
nt = Network('1200', '1600', directed=True, notebook=True, cdn_resources='remote')
Graph = nx.DiGraph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
tweet_groups = [lululemon_high_sentiment_tweets, adidas_high_sentiment_tweets, nike_high_sentiment_tweets]
tweet_dest = ['lululemon','adidas','nike']
for group, dest in zip(tweet_groups, tweet_dest):
    for tw in group:
        sender = tw['user']['screen_name']
        if Graph.has_edge(sender, dest):
            # 'title' = raw tweet count; 'weight' = compressed display width.
            mentions = Graph.edges[sender, dest]['title']
            Graph.edges[sender, dest]['weight'] = ((mentions + 1) // 5) + 1
            Graph.edges[sender, dest]['title'] = mentions + 1
        else:
            Graph.add_edge(sender, dest, weight=1, title=1)
nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/high_sentiment_mention_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 221
Edges: 223
Analysis
We can see that there is very little overlap between users who at (@) tweet with a high (positive) sentiment. Only Nike and Adidas share users, while Lululemon has no users in common with Nike and Adidas. Lululemon does have the fewest users with a high sentiment tweet overall, but this is expected because Lululemon was the smallest subset of tweets in this dataset.
# Semantic network for the positive-sentiment tweets: brand -> top-150 words.
nt = Network('1200', '1600', directed=False, notebook=True, cdn_resources='remote')
Graph = nx.Graph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
vocab_df = [lululemon_vocab_df, adidas_vocab_df, nike_vocab_df]
tweet_groups = ['lululemon', 'adidas', 'nike']
for vocab, brand in zip(vocab_df, tweet_groups):
    top_words = vocab.sort_values('count', ascending=False).head(150)
    for word, count in zip(top_words['word'], top_words['count']):
        freq = int(count)
        # 'title' carries the raw count (hover tooltip); 'weight' scales width.
        Graph.add_edge(word.lower(), brand, weight=freq // 100, title=freq)
nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/high_sentiment_words_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 323
Edges: 430
Analysis
There are a fair amount of shared words from the positive sentiment tweets. Some words were either brand names or names, but most were positive descriptors. Between all three companies, there were shared words like ‘family’, ‘good’, and ‘amazing’, which was not seen in the previous semantic network graphs.
# Strongly negative tweets only (compound <= -0.92).
lululemon_df_low_sentiment = lululemon_df[lululemon_df['tweet_sentiment'] <= -0.92]
adidas_df_low_sentiment = adidas_df[adidas_df['tweet_sentiment'] <= -0.92]
nike_df_low_sentiment = nike_df[nike_df['tweet_sentiment'] <= -0.92]
lululemon_low_sentiment_tweets = lululemon_df_low_sentiment['tweet'].values
adidas_low_sentiment_tweets = adidas_df_low_sentiment['tweet'].values
nike_low_sentiment_tweets = nike_df_low_sentiment['tweet'].values
# Word-frequency tables for the negative segment (content-word POS tags only).
lululemon_vocab_df = pd.DataFrame(list(token_counts(list(lululemon_low_sentiment_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count']) #Adjective, Nouns, and Verbs Only
adidas_vocab_df = pd.DataFrame(list(token_counts(list(adidas_low_sentiment_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count'])
nike_vocab_df = pd.DataFrame(list(token_counts(list(nike_low_sentiment_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count'])
# Directed mention graph built from the strongly negative tweets.
nt = Network('1200', '1600', directed=True, notebook=True, cdn_resources='remote')
Graph = nx.DiGraph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
tweet_groups = [lululemon_low_sentiment_tweets, adidas_low_sentiment_tweets, nike_low_sentiment_tweets]
tweet_dest = ['lululemon','adidas','nike']
for group, dest in zip(tweet_groups, tweet_dest):
    for tw in group:
        sender = tw['user']['screen_name']
        if Graph.has_edge(sender, dest):
            # 'title' = raw tweet count; 'weight' = compressed display width.
            mentions = Graph.edges[sender, dest]['title']
            Graph.edges[sender, dest]['weight'] = ((mentions + 1) // 5) + 1
            Graph.edges[sender, dest]['title'] = mentions + 1
        else:
            Graph.add_edge(sender, dest, weight=1, title=1)
nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/low_sentiment_mention_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 202
Edges: 203
Analysis
For the negative sentiment tweets, only Nike and Adidas shared users who at (@) tweeted them. Lululemon only had one user, while Nike took up almost all the unique users who at (@) tweeted them a very negative sentiment (bad) tweet.
# Semantic network for the negative-sentiment tweets: brand -> top-150 words.
nt = Network('1200', '1600', directed=False, notebook=True, cdn_resources='remote')
Graph = nx.Graph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
vocab_df = [lululemon_vocab_df, adidas_vocab_df, nike_vocab_df]
tweet_groups = ['lululemon', 'adidas', 'nike']
for vocab, brand in zip(vocab_df, tweet_groups):
    top_words = vocab.sort_values('count', ascending=False).head(150)
    for word, count in zip(top_words['word'], top_words['count']):
        freq = int(count)
        # 'title' carries the raw count (hover tooltip); 'weight' scales width.
        Graph.add_edge(word.lower(), brand, weight=freq // 100, title=freq)
nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/low_sentiment_words_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 260
Edges: 303
Analysis
For the negative sentiment tweets, there were still some shared words between companies. Interestingly, the only word shared by all three companies was ‘service,’ indicating that most users who tweeted with a negative sentiment were likely unhappy with the companies’ service — possibly their customer service in particular.