DTSA 5580 Network Analysis Final Project
Important Note: Please click on the specific html files to see the weights (hover-able) and explore node names/words more in-depth.
Load Packages
import gzip
import json
import nltk
import glob
import os
import shutil
import json
import csv
import networkx as nx
import matplotlib.pyplot as plt
try:
import pyvis
from pyvis.network import Network
except:
!pip install pyvis
import pyvis
from pyvis import Network
from time import sleep
import nltk
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()
import re
import shutil
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
import string
import itertools
punctuation = string.punctuation
stopwordsset = set(stopwords.words("english"))
stopwordsset.add('rt')
stopwordsset.add("'s")
from datetime import datetime
import pandas as pd
from IPython.core.display import display, HTML
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data] /root/nltk_data...
[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.
True
Load Data
# Mount Google Drive so the tweet archive can be read and the pyvis HTML
# graphs written below persist across Colab sessions.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
Define Extract Functions
def extract_mention(tweet, mention):
    """Return 1 if `tweet` @-mentions `mention`, else 0.

    A match is a case-insensitive equality against either the mention's
    screen name or its display name.

    Fixes over the original: it implicitly returned None (not 0) when the
    tweet carried no 'user_mentions' entity, and its `else: return 0` could
    bail out before every mention in the list had been checked. This version
    always returns 0/1 and scans the full mention list.

    Parameters
    ----------
    tweet : dict
        Parsed tweet JSON; must have an 'entities' key.
    mention : str
        Brand/user name to look for.
    """
    target = mention.upper()
    for entry in tweet['entities'].get('user_mentions', []):
        if entry['screen_name'].upper() == target or entry['name'].upper() == target:
            return 1
    return 0
def filter_tweets(tweets, mention):
    """Yield (screen_name, mention, tweet_dict) for each tweet that @-mentions `mention`.

    `tweets` may contain raw JSONL lines (bytes/str) or already-parsed dicts;
    raw lines are decoded on the fly.
    """
    for raw in tweets:
        parsed = json.loads(raw) if isinstance(raw, (bytes, str)) else raw
        if extract_mention(parsed, mention):
            yield parsed['user']['screen_name'], mention, parsed
# Taken from Week 4 Lecture Notebook
def removeURL(text):
    """Strip URLs plus a few typographic characters from `text`.

    The curly quotes, em dash, and ellipsis below are not in
    string.punctuation, so the punctuation filter would miss them.
    """
    without_links = re.sub(r"http\S+", "", text)
    return re.sub(r"[’“”—…]", "", without_links)
#removes useless words such as a, an, the
def stopWords(tokenizedtext):
    """Return the tokens from `tokenizedtext` that are not in the
    module-level stop-word set (English stop words plus 'rt' and "'s")."""
    return [token for token in tokenizedtext if token not in stopwordsset]
# feature reduction. taking words and getting their roots and graphing only the root words
def lemmatizer(tokenizedtext):
    """Map every token to its WordNet lemma using the module-level lemmatizer `wn`."""
    return [wn.lemmatize(token) for token in tokenizedtext]
#inputs a list of tokens and returns a list of unpunctuated tokens/words
def removePunctuation(tokenizedtext):
    """Drop punctuation tokens, then strip punctuation characters from the rest.

    Two passes, as in the original: first remove tokens that are themselves
    punctuation (substring test against the punctuation string), then delete
    any remaining punctuation characters inside the surviving tokens.
    NOTE: a token made only of repeated punctuation (e.g. '...') survives the
    first pass and becomes '' after stripping — empty strings are kept.
    """
    # Translation table built once instead of per token.
    strip_table = str.maketrans('', '', string.punctuation)
    survivors = [token for token in tokenizedtext if token not in punctuation]
    return [token.translate(strip_table) for token in survivors]
def removesinglewords(tokenizedtext):
    """Keep only tokens longer than one character (drops stray letters and '')."""
    return [feature for feature in tokenizedtext if len(feature) > 1]
# Adapted from Week 4 Lab
def token_counts(tweets, tagger=nltk.tag.PerceptronTagger().tag, tokenizer=nltk.TweetTokenizer().tokenize, parts_of_speech=None):
    """Count cleaned-token frequencies across `tweets`, optionally POS-filtered.

    Each tweet is URL-stripped, tokenized, stop-word-filtered, lemmatized,
    and de-punctuated before tagging.

    Parameters
    ----------
    tweets : iterable
        Raw JSONL lines (bytes/str) or parsed tweet dicts.
    tagger, tokenizer : callables
        NLTK tagger/tokenizer; defaults are built once at definition time.
    parts_of_speech : list[str] | None
        Penn Treebank tags to keep; None/empty means count every token.

    Returns
    -------
    dict mapping token -> occurrence count.

    Fixes over the original: `is None` instead of `== None`, dict.get for the
    counting idiom, and the POS loop no longer runs (as a guaranteed no-op)
    when no POS filter was requested.
    """
    if parts_of_speech is None:
        parts_of_speech = []
    token_dict = {}
    for tweet in tweets:
        if isinstance(tweet, (bytes, str)):
            tweet = json.loads(tweet)
        # Extended tweets store the body under 'full_text'.
        if 'full_text' in tweet.keys():
            tweet_text = tweet['full_text']
        else:
            tweet_text = tweet['text']
        tweet_text = removeURL(tweet_text)
        token = tokenizer(tweet_text)
        token = stopWords(token)
        token = lemmatizer(token)
        token = removePunctuation(token)
        tags = tagger(token)
        if len(tags) == 0:
            continue
        if len(parts_of_speech) == 0:
            # No POS filter: count every token.
            for word, _tag in tags:
                token_dict[word] = token_dict.get(word, 0) + 1
        else:
            for item in tags:
                if item is None:
                    continue
                word, tag = item
                if tag in parts_of_speech:
                    token_dict[word] = token_dict.get(word, 0) + 1
    return token_dict
def tweet_sentiment(tweet, tokenizer=nltk.TweetTokenizer().tokenize):
    """Return the VADER compound sentiment score (-1.0 .. 1.0) for one tweet dict.

    The tweet text goes through the same cleaning pipeline as token_counts
    (URL removal, tokenization, stop words, lemmatization, punctuation)
    before being rejoined and scored.

    Performance fix: the original constructed a fresh
    SentimentIntensityAnalyzer — reloading the VADER lexicon — on every call,
    which dominated runtime when applied over ~160k tweets. The analyzer is
    now built once and cached on the function object.
    """
    if 'full_text' in tweet.keys():
        tweet_text = tweet['full_text']
    else:
        tweet_text = tweet['text']
    tweet_text = removeURL(tweet_text)
    token = tokenizer(tweet_text)
    token = stopWords(token)
    token = lemmatizer(token)
    token = removePunctuation(token)
    sentence = ' '.join(token)
    if not hasattr(tweet_sentiment, '_sia'):
        tweet_sentiment._sia = SentimentIntensityAnalyzer()
    scores = tweet_sentiment._sia.polarity_scores(sentence)
    return scores['compound']
Extract Tweets with an @ Reference to Companies
# Read the compressed JSONL archive ONCE and reuse the raw lines for all three
# brand filters — the original reopened and re-decompressed the same file
# three times. filter_tweets decodes each line and keeps only tweets that
# @-mention the given brand.
with gzip.open('drive/MyDrive/nikelululemonadidas_tweets.jsonl.gz') as f:
    raw_tweet_lines = f.readlines()
lululemon = list(filter_tweets(raw_tweet_lines, 'lululemon'))
nike = list(filter_tweets(raw_tweet_lines, 'nike'))
adidas = list(filter_tweets(raw_tweet_lines, 'adidas'))
print('Lululemon Tweet Mentions:', len(lululemon))
print('Nike Tweet Mentions:', len(nike))
print('Adidas Tweet Mentions:', len(adidas))
Lululemon Tweet Mentions: 6168
Nike Tweet Mentions: 118953
Adidas Tweet Mentions: 36485
Central Users
We can start by investigating the key users and their tweets. We will first create a subset of the top 100 users per segment.
# One row per (user, brand, raw tweet dict) tuple.
nike_df = pd.DataFrame(nike, columns=['user','segment', 'tweet'])
adidas_df = pd.DataFrame(adidas, columns=['user','segment', 'tweet'])
lululemon_df = pd.DataFrame(lululemon, columns=['user','segment', 'tweet'])
# Pull each author's profile description out of the raw tweet object.
nike_df['user_description'] = nike_df['tweet'].apply(lambda x: x['user']['description'])
# BUG FIX: this line previously read from lululemon_df, so adidas rows got
# lululemon users' descriptions (and NaN beyond lululemon's shorter index) —
# visible as spurious 'N/A' entries in the adidas top-20 table below.
adidas_df['user_description'] = adidas_df['tweet'].apply(lambda x: x['user']['description'])
lululemon_df['user_description'] = lululemon_df['tweet'].apply(lambda x: x['user']['description'])
nike_df['user_description'] = nike_df['user_description'].fillna('N/A')
adidas_df['user_description'] = adidas_df['user_description'].fillna('N/A')
lululemon_df['user_description'] = lululemon_df['user_description'].fillna('N/A')
# Top 20 nike @-mentioners, joined with one (max) description per user.
nike_df['user'].value_counts(ascending=False).rename_axis('user').reset_index(name='counts').head(20).merge(nike_df[['user','user_description']].groupby('user',as_index=False).max(), how='left', on='user')
| user | counts | user_description | |
|---|---|---|---|
| 0 | SneakerScouts | 6891 | The #1 source for sneaker news, release dates,... |
| 1 | HUsBadGuys | 4067 | HU's Bad Guys #HBOW |
| 2 | Kaya_Alexander5 | 720 | Just a girl who loves her sneakers. The sneake... |
| 3 | Stealth783 | 590 | |
| 4 | GirardisGod | 530 | My 2 Instagram accounts are (@girardisgodgram)... |
| 5 | ShockandAweEnt | 361 | Providing a broad range of entertainment aroun... |
| 6 | vadriano2000 | 334 | |
| 7 | turtlepace5 | 321 | CT born, WI raised. Packers, Auntie, Coffee, S... |
| 8 | SSBrandon | 278 | @Nike Apostle & #SNKRS VET, who pledged allegi... |
| 9 | zen_masstah | 271 | SNKR head, hip hop, anti influencer, hater of ... |
| 10 | jadendaly | 247 | Please allow me to introduce myself: I’m a man... |
| 11 | DJBLUIZ | 212 | Dj/Sneakerhead 👟10.5-11 - Cowboys-Knicks-Devil... |
| 12 | levibrian86 | 207 | chef, private sec., vocal singing,let u know ... |
| 13 | therealJCW | 205 | #SneakerScouts @SneakerScouts @ShockandAweEnt |
| 14 | efiorentino31 | 203 | ☀︎︎ ♍︎ ☽ ♓︎ ❥ sneakers & makeup ♥︎ |
| 15 | Moonman989 | 177 | #AJ1FAM |
| 16 | joshuajhan | 171 | 🇰🇷 916 English Bulldogs Nike Jumpman sneakerhe... |
| 17 | beiberlove69 | 157 | it's a big hat, it's funny\nsz13 crew |
| 18 | BoysRevelacion | 137 | |
| 19 | DoBetterBB | 136 |
# Top 20 adidas @-mentioners, joined with one (max) profile description per user.
adidas_df['user'].value_counts(ascending=False).rename_axis('user').reset_index(name='counts').head(20).merge(adidas_df[['user','user_description']].groupby('user',as_index=False).max(), how='left', on='user')
| user | counts | user_description | |
|---|---|---|---|
| 0 | BoysRevelacion | 137 | test pilot for pies. no thoughts, no vibes. #b... |
| 1 | zen_masstah | 123 | N/A |
| 2 | bleustar9757 | 108 | N/A |
| 3 | jajuanmharley | 89 | 🏳️🌈Health educator and certified fitness trai... |
| 4 | turtlepace5 | 68 | N/A |
| 5 | TheRealMGesta | 68 | 🌎 #VoteBlue2022 #NeverGOP #WeVote,WeWin\nNo u... |
| 6 | restebanrf1993 | 67 | Transport Writer and Editor En/Fr |
| 7 | KVSwitzer | 53 | likes = 🙄 / 🥰/ 🤣 / 🤬 / 🥴 |
| 8 | GrossAmilee | 39 | N/A |
| 9 | natmmom | 35 | 🇱🇧West Coast Phalange Supporter🇱🇧 Retvrn to Af... |
| 10 | wearekrimy | 35 | So Cal born and raised. Athletic Trainer, Phys... |
| 11 | Moonman989 | 34 | N/A |
| 12 | erik102079 | 34 | N/A |
| 13 | FootwearNews | 33 | photo:@linksgems |
| 14 | golacokits | 31 | N/A |
| 15 | kaflickinger74 | 30 | seo | snack enthusiast | photography | grandma... |
| 16 | josiethewonder | 27 | Photography📸 Cane Creek Distillery🥃 |
| 17 | DeionPatterson1 | 27 | N/A |
| 18 | BPrince95 | 26 | welcome to the clown show |
| 19 | JamieGeorge93 | 25 | N/A |
# Top 20 lululemon @-mentioners, joined with one (max) profile description per user.
lululemon_df['user'].value_counts(ascending=False).rename_axis('user').reset_index(name='counts').head(20).merge(lululemon_df[['user','user_description']].groupby('user',as_index=False).max(), how='left', on='user')
| user | counts | user_description | |
|---|---|---|---|
| 0 | JimAceman | 32 | Mammal, Partner, Father, Son, Brother, Friend,... |
| 1 | MattMully | 31 | Be...excellent to eachother. |
| 2 | kinseyfit | 24 | “Fit & Fearless” - where we believe it’s not j... |
| 3 | WhatAndrewSaid | 23 | “so great looking and smart, a true Stable Gen... |
| 4 | MasashiKohmura | 20 | Cross-county skiing information specialized in... |
| 5 | liab9845 | 20 | |
| 6 | blythelia3505 | 16 | $jillnauyokas |
| 7 | Chrisblythe9845 | 16 | Food & Drink Funny Fashion Television Music Di... |
| 8 | DeezeFi | 14 | all I know is I don't know nothing | lindy wal... |
| 9 | gomerland2 | 14 | Survivor and researcher as I embark in a journ... |
| 10 | MrLeonardKim | 14 | I feel the pain in your heart and I know what ... |
| 11 | 365yogadream | 13 | Ambassador @lululemon -Nutrition -Emergency Pr... |
| 12 | C_kelly1988 | 13 | ⚾️ 🌳 🥾 👟 size 11-12. Cincy sports fan. #teamcr... |
| 13 | MattTooze | 13 | Experienced ex athlete 800m 1.57.3. sub 16 5k ... |
| 14 | AFineBlogger | 13 | VP East 86th St Assoc, created https://t.co/Uz... |
| 15 | cloudwhiteNFT | 12 | financial philosopher. @axieinfinity evangelis... |
| 16 | lulunotify | 11 | The first monitor service for #lululemon drops... |
| 17 | TheSportsIndex | 11 | “THE bellwether stock market index for sports.... |
| 18 | sean_broedow | 11 | I’m a runner, a 2021 Legacy Nuun Ambassador, 2... |
| 19 | aleman305 | 10 | BJJ, MMA, CrossFit, and Olympic Weightlifting |
Just looking at the top users for each company, we can see that users who at (@) tweet Nike are often sneakerheads, or people who either collect or buy lots of sneakers. Both Adidas and Lululemon’s users seem to be a more random assortment of people. Lululemon does, however, seem to be at (@) tweeted by a lot of fitness and health inclined people.
# Screen names of the 100 most prolific @-mentioners per brand
# (value_counts sorts descending by default; the index holds the names).
top_nike = nike_df['user'].value_counts().head(100).index.values
top_adidas = adidas_df['user'].value_counts().head(100).index.values
top_lululemon = lululemon_df['user'].value_counts().head(100).index.values
# Directed mention graph: top-100 users per brand -> brand node.
nt = Network('1200', '1600', directed=True, notebook=True, cdn_resources='remote')
Graph = nx.DiGraph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
tweet_groups = [lululemon, adidas, nike]
tweet_users = [top_lululemon, top_adidas, top_nike]
for group, users in zip(tweet_groups, tweet_users):
    # Set gives O(1) membership tests; the original scanned a numpy array
    # of 100 names for every one of the ~160k tweets.
    user_set = set(users)
    for user, brand, _tweet in group:
        if user in user_set:
            if Graph.has_edge(user, brand):
                # 'title' tracks the raw mention count (hover tooltip);
                # 'weight' is a compressed edge width for display.
                w = Graph.edges[user, brand]['title']
                Graph.edges[user, brand]['weight'] = ((w + 1) // 100) + 1
                Graph.edges[user, brand]['title'] = w + 1
            else:
                Graph.add_edge(user, brand, weight=1, title=1)
nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/user_mention_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 285
Edges: 300
Analysis
We can see from the pyvis network graph that of the top 100 users for each company, the red cluster (Nike) and yellow cluster (Adidas) had more users in common than the blue cluster (Lululemon). This makes sense as both Nike and Adidas are primarily known for their shoes, while Lululemon is known for their clothing. Similarly, even when comparing clothing, which Nike and Adidas also manufacture, they primarily focus on workout clothing, while Lululemon is famous for its yoga attire.
Similarly, when looking at the number and size of node linkages, Nike has lots of users who continually at (@) tweet them. One user, SneakerScouts, tweeted almost 7,000 times within the dataset. Comparatively, Adidas and Lululemon did not have any followers who tweeted at (@) them that much.
There do seem to be a handful of users who tweet at both Nike and Adidas.
We can now investigate the key words these top users are using in their quote tweets.
# Restrict each brand's tweets to its top-100 users.
lululemon_df_users = lululemon_df[lululemon_df['user'].isin(top_lululemon)]
adidas_df_users = adidas_df[adidas_df['user'].isin(top_adidas)]
nike_df_users = nike_df[nike_df['user'].isin(top_nike)]
# Raw tweet dicts for those users only.
lululemon_user_tweets = lululemon_df_users['tweet'].values
adidas_user_tweets = adidas_df_users['tweet'].values
nike_user_tweets = nike_df_users['tweet'].values
# Word-frequency tables per brand, keeping only content words
# (Penn Treebank adjective/noun/verb tags).
lululemon_vocab_df = pd.DataFrame(list(token_counts(list(lululemon_user_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count']) #Adjective, Nouns, and Verbs Only
adidas_vocab_df = pd.DataFrame(list(token_counts(list(adidas_user_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count'])
nike_vocab_df = pd.DataFrame(list(token_counts(list(nike_user_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count'])
# Semantic network: each brand node linked to its 150 most frequent words.
nt = Network('1200', '1600', directed=False, notebook=True, cdn_resources='remote')
Graph = nx.Graph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
vocab_df = [lululemon_vocab_df, adidas_vocab_df, nike_vocab_df]
tweet_groups = ['lululemon', 'adidas', 'nike']
for vocab, brand in zip(vocab_df, tweet_groups):
    top_words = vocab.sort_values('count', ascending=False).head(150)
    for word, count in zip(top_words['word'], top_words['count']):
        freq = int(count)
        # 'title' carries the raw count (hover tooltip); 'weight' scales width.
        Graph.add_edge(word.lower(), brand, weight=freq // 100, title=freq)
nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/user_words_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 386
Edges: 433
Analysis
We can see that Nike in particular is referenced much more in at (@) tweets than other companies. Interestingly enough, Nike’s at (@) tweets also had a higher frequency of ‘RT,’ or retweet, than the other companies. Nike’s at (@) tweets also had a higher frequency of ‘available,’ which could indicate that more people are asking if shoes or other Nike products are available for purchase.
Lululemon had more emojis as frequent words than other companies. Nike’s most frequent word, however, was a reference to its most frequent at (@) tweeter, which seems to be a sneaker news site or account.
Segmentation by Follower Count
Because the number of unique users from the entire subset of tweets is so large, we can split the twitter mentions graph into two sections: users with over 50,000 followers and users between 4,300 and 5,000. This was done to limit the number of nodes on a graph and provide a similar number of nodes between network graphs.
# Directed mention graph limited to authors with >= 50,000 followers.
nt = Network('1200', '1600', directed=True, notebook=True, cdn_resources='remote')
Graph = nx.DiGraph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
tweet_groups = [lululemon, adidas, nike]
for group in tweet_groups:
    for user, brand, tweet in group:
        if tweet['user']['followers_count'] < 50000:
            continue
        if Graph.has_edge(user, brand):
            # 'title' = raw mention count; 'weight' = compressed display width.
            mentions = Graph.edges[user, brand]['title']
            Graph.edges[user, brand]['weight'] = ((mentions + 1) // 2) + 1
            Graph.edges[user, brand]['title'] = mentions + 1
        else:
            Graph.add_edge(user, brand, weight=1, title=1)
nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/high_follower_mention_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 285
Edges: 300
Analysis
We can see that most of the high follower count users at (@) tweet Nike more than the other brands. Adidas is (@) tweeted less and Lululemon even less so. We can also see there is a similar distribution to the number of users who at (@) tweet multiple companies. Most users who at (@) tweet two companies will (@) tweet Nike and Adidas. There are very few users who at (@) tweet all three companies.
# Directed mention graph for authors with 4,300-5,000 followers (inclusive).
nt = Network('1200', '1600', directed=True, notebook=True, cdn_resources='remote')
Graph = nx.DiGraph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
tweet_groups = [lululemon, adidas, nike]
for group in tweet_groups:
    for user, brand, tweet in group:
        # Chained comparison replaces the original bitwise `&` of two
        # parenthesized boolean tests — same result, idiomatic Python.
        if not (4300 <= tweet['user']['followers_count'] <= 5000):
            continue
        if Graph.has_edge(user, brand):
            # 'title' = raw mention count; 'weight' = compressed display width.
            mentions = Graph.edges[user, brand]['title']
            Graph.edges[user, brand]['weight'] = ((mentions + 1) // 2) + 1
            Graph.edges[user, brand]['title'] = mentions + 1
        else:
            Graph.add_edge(user, brand, weight=1, title=1)
nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/low_follower_mention_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 805
Edges: 869
Analysis
We can see that the distribution of at (@) tweets from lower follower tweeters is very similar to the high follower tweeters.
We can now investigate the semantic network graphs created from these two segments.
# Extract each author's follower count from the raw tweet object.
lululemon_df['followers'] = lululemon_df['tweet'].apply(lambda x: x['user']['followers_count'])
adidas_df['followers'] = adidas_df['tweet'].apply(lambda x: x['user']['followers_count'])
nike_df['followers'] = nike_df['tweet'].apply(lambda x: x['user']['followers_count'])
# High-follower segment: authors with at least 50k followers.
lululemon_df_high_follower = lululemon_df[lululemon_df['followers'] >= 50000]
adidas_df_high_follower = adidas_df[adidas_df['followers'] >= 50000]
nike_df_high_follower = nike_df[nike_df['followers'] >= 50000]
lululemon_high_follower_tweets = lululemon_df_high_follower['tweet'].values
adidas_high_follower_tweets = adidas_df_high_follower['tweet'].values
nike_high_follower_tweets = nike_df_high_follower['tweet'].values
# Word-frequency tables for the segment (content-word POS tags only).
lululemon_vocab_df = pd.DataFrame(list(token_counts(list(lululemon_high_follower_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count']) #Adjective, Nouns, and Verbs Only
adidas_vocab_df = pd.DataFrame(list(token_counts(list(adidas_high_follower_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count'])
nike_vocab_df = pd.DataFrame(list(token_counts(list(nike_high_follower_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count'])
# Semantic network for the high-follower segment: brand -> top-150 words.
nt = Network('1200', '1600', directed=False, notebook=True, cdn_resources='remote')
Graph = nx.Graph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
vocab_df = [lululemon_vocab_df, adidas_vocab_df, nike_vocab_df]
tweet_groups = ['lululemon', 'adidas', 'nike']
for vocab, brand in zip(vocab_df, tweet_groups):
    top_words = vocab.sort_values('count', ascending=False).head(150)
    for word, count in zip(top_words['word'], top_words['count']):
        freq = int(count)
        # 'title' carries the raw count (hover tooltip); 'weight' scales width.
        Graph.add_edge(word.lower(), brand, weight=freq // 100, title=freq)
nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/high_follower_words_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 338
Edges: 436
Analysis
We can see that the top 150 words associated with each of the brands from tweets with a high follower count are distributed similarly to the previous semantic network graphs. Both Nike and Adidas have more word overlap than with Lululemon. There are a few words that all three companies have in common, but all words are fairly common on twitter and when discussing a company, including rt (re-tweet), store, and online.
Interestingly, a fair amount of overlap words from Nike and Adidas are other companies (xbox, starbucks, subway, etc.). Both Adidas and Lululemon have a handful of emoji’s as their most frequent words.
# Low-follower segment: authors with 4,300-5,000 followers
# (Series.between is inclusive on both ends, matching >= 4300 & <= 5000).
lululemon_df_low_follower = lululemon_df[lululemon_df['followers'].between(4300, 5000)]
adidas_df_low_follower = adidas_df[adidas_df['followers'].between(4300, 5000)]
nike_df_low_follower = nike_df[nike_df['followers'].between(4300, 5000)]
lululemon_low_follower_tweets = lululemon_df_low_follower['tweet'].values
adidas_low_follower_tweets = adidas_df_low_follower['tweet'].values
nike_low_follower_tweets = nike_df_low_follower['tweet'].values
# Adjectives, nouns, and verbs only (Penn Treebank tags).
content_pos = ['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
lululemon_vocab_df = pd.DataFrame(list(token_counts(list(lululemon_low_follower_tweets), parts_of_speech=content_pos).items()), columns=['word','count'])
adidas_vocab_df = pd.DataFrame(list(token_counts(list(adidas_low_follower_tweets), parts_of_speech=content_pos).items()), columns=['word','count'])
nike_vocab_df = pd.DataFrame(list(token_counts(list(nike_low_follower_tweets), parts_of_speech=content_pos).items()), columns=['word','count'])
# Semantic network for the low-follower segment: brand -> top-150 words.
nt = Network('1200', '1600', directed=False, notebook=True, cdn_resources='remote')
Graph = nx.Graph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
vocab_df = [lululemon_vocab_df, adidas_vocab_df, nike_vocab_df]
tweet_groups = ['lululemon', 'adidas', 'nike']
for vocab, brand in zip(vocab_df, tweet_groups):
    top_words = vocab.sort_values('count', ascending=False).head(150)
    for word, count in zip(top_words['word'], top_words['count']):
        freq = int(count)
        # 'title' carries the raw count (hover tooltip); 'weight' scales width.
        Graph.add_edge(word.lower(), brand, weight=freq // 100, title=freq)
nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/low_follower_words_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 353
Edges: 439
Analysis
We can see that the top 150 words associated with each of the brands from tweets with a low follower count are noticeably different from the previous semantic network graphs. Here, there are more overlapping words between the three companies, which could indicate that there is less brand recognition for users with lower follower counts. Visually, it looks like Lululemon has the largest number of unshared words while Nike has the least.
Sentiment Analysis
We can also attempt to classify tweets by sentiment and take a small subset of each to analyze user and word networks.
# takes a while to run
# VADER compound score per tweet: -1 (most negative) .. +1 (most positive).
lululemon_df['tweet_sentiment'] = lululemon_df['tweet'].apply(tweet_sentiment)
adidas_df['tweet_sentiment'] = adidas_df['tweet'].apply(tweet_sentiment)
nike_df['tweet_sentiment'] = nike_df['tweet'].apply(tweet_sentiment)
# Strongly positive tweets only (compound >= 0.95).
lululemon_df_high_sentiment = lululemon_df[lululemon_df['tweet_sentiment'] >= 0.95]
adidas_df_high_sentiment = adidas_df[adidas_df['tweet_sentiment'] >= 0.95]
nike_df_high_sentiment = nike_df[nike_df['tweet_sentiment'] >= 0.95]
lululemon_high_sentiment_tweets = lululemon_df_high_sentiment['tweet'].values
adidas_high_sentiment_tweets = adidas_df_high_sentiment['tweet'].values
nike_high_sentiment_tweets = nike_df_high_sentiment['tweet'].values
# Word-frequency tables for the positive segment (content-word POS tags only).
lululemon_vocab_df = pd.DataFrame(list(token_counts(list(lululemon_high_sentiment_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count']) #Adjective, Nouns, and Verbs Only
adidas_vocab_df = pd.DataFrame(list(token_counts(list(adidas_high_sentiment_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count'])
nike_vocab_df = pd.DataFrame(list(token_counts(list(nike_high_sentiment_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count'])
# Directed mention graph built from the strongly positive tweets.
nt = Network('1200', '1600', directed=True, notebook=True, cdn_resources='remote')
Graph = nx.DiGraph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
tweet_groups = [lululemon_high_sentiment_tweets, adidas_high_sentiment_tweets, nike_high_sentiment_tweets]
tweet_dest = ['lululemon','adidas','nike']
for group, dest in zip(tweet_groups, tweet_dest):
    for tw in group:
        sender = tw['user']['screen_name']
        if Graph.has_edge(sender, dest):
            # 'title' = raw tweet count; 'weight' = compressed display width.
            mentions = Graph.edges[sender, dest]['title']
            Graph.edges[sender, dest]['weight'] = ((mentions + 1) // 5) + 1
            Graph.edges[sender, dest]['title'] = mentions + 1
        else:
            Graph.add_edge(sender, dest, weight=1, title=1)
nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/high_sentiment_mention_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 221
Edges: 223
Analysis
We can see that there is very little overlap between users who at (@) tweet with a high (positive) sentiment. Only Nike and Adidas share users, while Lululemon has no users in common with Nike and Adidas. Lululemon does have the fewest users with a high sentiment tweet overall, but this is expected because Lululemon was the smallest subset of tweets in this dataset.
# Semantic network for the positive-sentiment tweets: brand -> top-150 words.
nt = Network('1200', '1600', directed=False, notebook=True, cdn_resources='remote')
Graph = nx.Graph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
vocab_df = [lululemon_vocab_df, adidas_vocab_df, nike_vocab_df]
tweet_groups = ['lululemon', 'adidas', 'nike']
for vocab, brand in zip(vocab_df, tweet_groups):
    top_words = vocab.sort_values('count', ascending=False).head(150)
    for word, count in zip(top_words['word'], top_words['count']):
        freq = int(count)
        # 'title' carries the raw count (hover tooltip); 'weight' scales width.
        Graph.add_edge(word.lower(), brand, weight=freq // 100, title=freq)
nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/high_sentiment_words_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 323
Edges: 430
Analysis
There are a fair amount of shared words from the positive sentiment tweets. Some words were either brand names or names, but most were positive descriptors. Between all three companies, there were shared words like ‘family’, ‘good’, and ‘amazing’, which was not seen in the previous semantic network graphs.
# Strongly negative tweets only (compound <= -0.92).
lululemon_df_low_sentiment = lululemon_df[lululemon_df['tweet_sentiment'] <= -0.92]
adidas_df_low_sentiment = adidas_df[adidas_df['tweet_sentiment'] <= -0.92]
nike_df_low_sentiment = nike_df[nike_df['tweet_sentiment'] <= -0.92]
lululemon_low_sentiment_tweets = lululemon_df_low_sentiment['tweet'].values
adidas_low_sentiment_tweets = adidas_df_low_sentiment['tweet'].values
nike_low_sentiment_tweets = nike_df_low_sentiment['tweet'].values
# Word-frequency tables for the negative segment (content-word POS tags only).
lululemon_vocab_df = pd.DataFrame(list(token_counts(list(lululemon_low_sentiment_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count']) #Adjective, Nouns, and Verbs Only
adidas_vocab_df = pd.DataFrame(list(token_counts(list(adidas_low_sentiment_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count'])
nike_vocab_df = pd.DataFrame(list(token_counts(list(nike_low_sentiment_tweets), parts_of_speech=['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']).items()), columns=['word','count'])
# Directed mention graph built from the strongly negative tweets.
nt = Network('1200', '1600', directed=True, notebook=True, cdn_resources='remote')
Graph = nx.DiGraph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
tweet_groups = [lululemon_low_sentiment_tweets, adidas_low_sentiment_tweets, nike_low_sentiment_tweets]
tweet_dest = ['lululemon','adidas','nike']
for group, dest in zip(tweet_groups, tweet_dest):
    for tw in group:
        sender = tw['user']['screen_name']
        if Graph.has_edge(sender, dest):
            # 'title' = raw tweet count; 'weight' = compressed display width.
            mentions = Graph.edges[sender, dest]['title']
            Graph.edges[sender, dest]['weight'] = ((mentions + 1) // 5) + 1
            Graph.edges[sender, dest]['title'] = mentions + 1
        else:
            Graph.add_edge(sender, dest, weight=1, title=1)
nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/low_sentiment_mention_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 202
Edges: 203
Analysis
For the negative sentiment tweets, only Nike and Adidas shared users who at (@) tweeted them. Lululemon only had one user, while Nike took up almost all the unique users who at (@) tweeted them a very negative sentiment (bad) tweet.
# Semantic network for the negative-sentiment tweets: brand -> top-150 words.
nt = Network('1200', '1600', directed=False, notebook=True, cdn_resources='remote')
Graph = nx.Graph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
vocab_df = [lululemon_vocab_df, adidas_vocab_df, nike_vocab_df]
tweet_groups = ['lululemon', 'adidas', 'nike']
for vocab, brand in zip(vocab_df, tweet_groups):
    top_words = vocab.sort_values('count', ascending=False).head(150)
    for word, count in zip(top_words['word'], top_words['count']):
        freq = int(count)
        # 'title' carries the raw count (hover tooltip); 'weight' scales width.
        Graph.add_edge(word.lower(), brand, weight=freq // 100, title=freq)
nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/low_sentiment_words_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 260
Edges: 303
Analysis
For the negative sentiment tweets, there were still some shared words between companies. Interestingly, the only word shared by all three companies was ‘service,’ indicating that most users who tweeted with a negative sentiment were likely unhappy with the companies’ service — possibly their customer service in particular.