Important Note: Please click on the specific html files to see the weights (hover-able) and explore node names/words more in-depth.

Load Packages

import gzip
import json
import nltk

import glob
import os
import shutil
import json
import csv
import networkx as nx
import matplotlib.pyplot as plt
try:
  import pyvis
  from pyvis.network import Network
except:
  !pip install pyvis
  import pyvis
  from pyvis import Network
from time import sleep
import nltk
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()
import re
import shutil
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
import string
import itertools
punctuation = string.punctuation
stopwordsset = set(stopwords.words("english"))
stopwordsset.add('rt')
stopwordsset.add("'s")
from datetime import datetime
import pandas as pd

from IPython.core.display import display, HTML

nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.





True

Load Data

# Mount Google Drive so the gzipped tweet archive is readable at drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive

Define Extract Functions

def extract_mention(tweet, mention):
  """Return 1 if `tweet` @-mentions `mention`, else 0.

  A mention matches if either the screen name or the display name equals
  `mention`, case-insensitively.

  Fixed: the original returned None (instead of 0) whenever the tweet had
  user_mentions but none of them matched; callers relying on truthiness
  still worked, but the contract is now a consistent 0/1.
  """
  target = mention.upper()
  for m in tweet['entities'].get('user_mentions', []):
    if target in (m['screen_name'].upper(), m['name'].upper()):
      return 1
  return 0

def filter_tweets(tweets, mention):
    """Yield (screen_name, mention, tweet) for every tweet that @-mentions `mention`.

    Accepts raw JSONL records (bytes/str) or already-parsed tweet dicts.
    """
    for raw in tweets:
        tweet = json.loads(raw) if isinstance(raw, (bytes, str)) else raw
        if extract_mention(tweet, mention):
            yield tweet['user']['screen_name'], mention, tweet

# Taken from Week 4 Lecture Notebook
# Strips URLs plus typographic characters that string.punctuation does not cover.
def removeURL(text):
  """Return `text` with http(s) links, curly quotes, em-dashes, and ellipses removed."""
  cleaned = re.sub(r"http\S+", "", text)
  # These literals are not in string.punctuation, so strip them here.
  for special in ("’", "“", "”", "—", "…"):
    cleaned = cleaned.replace(special, "")
  return cleaned

# Removes useless words such as a, an, the (plus 'rt' and "'s").
def stopWords(tokenizedtext):
  """Return the tokens of `tokenizedtext` that are not in the global stopword set."""
  return [tok for tok in tokenizedtext if tok not in stopwordsset]

# Feature reduction: map each token to its WordNet lemma so only root forms are graphed.
def lemmatizer(tokenizedtext):
  """Return the WordNet lemma of each token, preserving order."""
  return [wn.lemmatize(tok) for tok in tokenizedtext]

# Inputs a list of tokens and returns a list of unpunctuated tokens/words.
def removePunctuation(tokenizedtext):
  """Drop pure-punctuation tokens, then strip punctuation characters from the rest.

  Note: tokens surviving the first pass may be reduced to empty strings by
  the translate step (e.g. a token of several non-adjacent punctuation marks).
  """
  strip_punct = str.maketrans('', '', string.punctuation)
  # `tok not in string.punctuation` drops any token that is a substring of
  # the punctuation string (single punctuation marks in particular).
  survivors = [tok for tok in tokenizedtext if tok not in string.punctuation]
  return [tok.translate(strip_punct) for tok in survivors]

def removesinglewords(tokenizedtext):
  """Keep only tokens longer than one character (drops empties and single chars)."""
  return [tok for tok in tokenizedtext if len(tok) > 1]

# Adapted from Week 4 Lab
def token_counts(tweets, tagger=nltk.tag.PerceptronTagger().tag, tokenizer=nltk.TweetTokenizer().tokenize, parts_of_speech=None):
    """Count cleaned tokens across `tweets`, optionally filtered by POS tag.

    Parameters
    ----------
    tweets : iterable of tweet dicts or raw JSONL records (bytes/str)
    tagger : callable mapping a token list to [(token, tag), ...]
    tokenizer : callable splitting tweet text into tokens
    parts_of_speech : list of Penn Treebank tags to keep; None or empty
        counts every token

    Returns a {token: count} dict.

    Fixed: the original compared tags with `== None` instead of `is None`,
    and when no POS filter was given it iterated the tag list twice (the
    unconditional counting loop was followed by a second, no-op filter loop).
    """
    wanted = set(parts_of_speech or [])
    token_dict = {}
    for tweet in tweets:
      if isinstance(tweet, (bytes, str)):
        tweet = json.loads(tweet)
      # Extended tweets carry their text under 'full_text'.
      tweet_text = tweet['full_text'] if 'full_text' in tweet else tweet['text']
      tweet_text = removeURL(tweet_text)
      token = tokenizer(tweet_text)
      token = stopWords(token)
      token = lemmatizer(token)
      token = removePunctuation(token)
      for tag in tagger(token):
        if tag is None:
          continue
        if not wanted or tag[1] in wanted:
          token_dict[tag[0]] = token_dict.get(tag[0], 0) + 1
    return token_dict

def tweet_sentiment(tweet, tokenizer=nltk.TweetTokenizer().tokenize):
  """Return the VADER compound sentiment score (-1..1) for one tweet dict.

  The text goes through the same cleaning pipeline used by token_counts so
  the score reflects the words analyzed elsewhere in the notebook.
  """
  if 'full_text' in tweet:
    tweet_text = tweet['full_text']
  else:
    tweet_text = tweet['text']
  tweet_text = removeURL(tweet_text)
  token = tokenizer(tweet_text)
  token = stopWords(token)
  token = lemmatizer(token)
  token = removePunctuation(token)
  sentence = ' '.join(token)
  # Perf fix: build the analyzer once and reuse it across calls — this
  # function is applied to every tweet via DataFrame.apply, and the original
  # constructed a fresh SentimentIntensityAnalyzer per call.
  analyzer = getattr(tweet_sentiment, '_analyzer', None)
  if analyzer is None:
    analyzer = SentimentIntensityAnalyzer()
    tweet_sentiment._analyzer = analyzer
  return analyzer.polarity_scores(sentence)['compound']

Extract Tweets with an @ Reference to Companies

# One filtering pass per brand over the gzipped JSONL tweet archive.
archive_path = 'drive/MyDrive/nikelululemonadidas_tweets.jsonl.gz'

with gzip.open(archive_path) as fh:
  lululemon = list(filter_tweets(fh, 'lululemon'))

with gzip.open(archive_path) as fh:
  nike = list(filter_tweets(fh, 'nike'))

with gzip.open(archive_path) as fh:
  adidas = list(filter_tweets(fh, 'adidas'))
print('Lululemon Tweet Mentions:', len(lululemon))
print('Nike Tweet Mentions:', len(nike))
print('Adidas Tweet Mentions:', len(adidas))
Lululemon Tweet Mentions: 6168
Nike Tweet Mentions: 118953
Adidas Tweet Mentions: 36485

Central Users

We can start by investigating the key users and their tweets. We will first create a subset of the top 100 users per segment.

# One DataFrame per brand: (user screen name, brand segment, raw tweet dict).
nike_df = pd.DataFrame(nike, columns=['user','segment', 'tweet'])
adidas_df = pd.DataFrame(adidas, columns=['user','segment', 'tweet'])
lululemon_df = pd.DataFrame(lululemon, columns=['user','segment', 'tweet'])
nike_df['user_description'] = nike_df['tweet'].apply(lambda x: x['user']['description'])
# Bug fix: adidas descriptions were being pulled from lululemon_df's tweets
# (copy-paste error); the mismatched index also left most rows NaN.
adidas_df['user_description'] = adidas_df['tweet'].apply(lambda x: x['user']['description'])
lululemon_df['user_description'] = lululemon_df['tweet'].apply(lambda x: x['user']['description'])

nike_df['user_description'] = nike_df['user_description'].fillna('N/A')
adidas_df['user_description'] = adidas_df['user_description'].fillna('N/A')
lululemon_df['user_description'] = lululemon_df['user_description'].fillna('N/A')
# Top 20 Nike posters joined with their (max) profile description.
nike_df['user'].value_counts(ascending=False).rename_axis('user').reset_index(name='counts').head(20).merge(nike_df[['user','user_description']].groupby('user',as_index=False).max(), how='left', on='user')
user counts user_description
0 SneakerScouts 6891 The #1 source for sneaker news, release dates,...
1 HUsBadGuys 4067 HU's Bad Guys #HBOW
2 Kaya_Alexander5 720 Just a girl who loves her sneakers. The sneake...
3 Stealth783 590
4 GirardisGod 530 My 2 Instagram accounts are (@girardisgodgram)...
5 ShockandAweEnt 361 Providing a broad range of entertainment aroun...
6 vadriano2000 334
7 turtlepace5 321 CT born, WI raised. Packers, Auntie, Coffee, S...
8 SSBrandon 278 @Nike Apostle & #SNKRS VET, who pledged allegi...
9 zen_masstah 271 SNKR head, hip hop, anti influencer, hater of ...
10 jadendaly 247 Please allow me to introduce myself: I’m a man...
11 DJBLUIZ 212 Dj/Sneakerhead 👟10.5-11 - Cowboys-Knicks-Devil...
12 levibrian86 207 chef, private sec., vocal singing,let u know ...
13 therealJCW 205 #SneakerScouts @SneakerScouts @ShockandAweEnt
14 efiorentino31 203 ☀︎︎ ♍︎ ☽ ♓︎ ❥ sneakers & makeup ♥︎
15 Moonman989 177 #AJ1FAM
16 joshuajhan 171 🇰🇷 916 English Bulldogs Nike Jumpman sneakerhe...
17 beiberlove69 157 it's a big hat, it's funny\nsz13 crew
18 BoysRevelacion 137
19 DoBetterBB 136
# Top 20 Adidas posters joined with their (max) profile description.
adidas_df['user'].value_counts(ascending=False).rename_axis('user').reset_index(name='counts').head(20).merge(adidas_df[['user','user_description']].groupby('user',as_index=False).max(), how='left', on='user')
user counts user_description
0 BoysRevelacion 137 test pilot for pies. no thoughts, no vibes. #b...
1 zen_masstah 123 N/A
2 bleustar9757 108 N/A
3 jajuanmharley 89 🏳️‍🌈Health educator and certified fitness trai...
4 turtlepace5 68 N/A
5 TheRealMGesta 68 🌎 #VoteBlue2022 #NeverGOP #WeVote,WeWin\nNo u...
6 restebanrf1993 67 Transport Writer and Editor En/Fr
7 KVSwitzer 53 likes = 🙄 / 🥰/ 🤣 / 🤬 / 🥴
8 GrossAmilee 39 N/A
9 natmmom 35 🇱🇧West Coast Phalange Supporter🇱🇧 Retvrn to Af...
10 wearekrimy 35 So Cal born and raised. Athletic Trainer, Phys...
11 Moonman989 34 N/A
12 erik102079 34 N/A
13 FootwearNews 33 photo:@linksgems
14 golacokits 31 N/A
15 kaflickinger74 30 seo | snack enthusiast | photography | grandma...
16 josiethewonder 27 Photography📸 Cane Creek Distillery🥃
17 DeionPatterson1 27 N/A
18 BPrince95 26 welcome to the clown show
19 JamieGeorge93 25 N/A
# Top 20 Lululemon posters joined with their (max) profile description.
lululemon_df['user'].value_counts(ascending=False).rename_axis('user').reset_index(name='counts').head(20).merge(lululemon_df[['user','user_description']].groupby('user',as_index=False).max(), how='left', on='user')
user counts user_description
0 JimAceman 32 Mammal, Partner, Father, Son, Brother, Friend,...
1 MattMully 31 Be...excellent to eachother.
2 kinseyfit 24 “Fit & Fearless” - where we believe it’s not j...
3 WhatAndrewSaid 23 “so great looking and smart, a true Stable Gen...
4 MasashiKohmura 20 Cross-county skiing information specialized in...
5 liab9845 20
6 blythelia3505 16 $jillnauyokas
7 Chrisblythe9845 16 Food & Drink Funny Fashion Television Music Di...
8 DeezeFi 14 all I know is I don't know nothing | lindy wal...
9 gomerland2 14 Survivor and researcher as I embark in a journ...
10 MrLeonardKim 14 I feel the pain in your heart and I know what ...
11 365yogadream 13 Ambassador @lululemon -Nutrition -Emergency Pr...
12 C_kelly1988 13 ⚾️ 🌳 🥾 👟 size 11-12. Cincy sports fan. #teamcr...
13 MattTooze 13 Experienced ex athlete 800m 1.57.3. sub 16 5k ...
14 AFineBlogger 13 VP East 86th St Assoc, created https://t.co/Uz...
15 cloudwhiteNFT 12 financial philosopher. @axieinfinity evangelis...
16 lulunotify 11 The first monitor service for #lululemon drops...
17 TheSportsIndex 11 “THE bellwether stock market index for sports....
18 sean_broedow 11 I’m a runner, a 2021 Legacy Nuun Ambassador, 2...
19 aleman305 10 BJJ, MMA, CrossFit, and Olympic Weightlifting

Just looking at the top users for each company, we can see that users who at (@) tweet Nike are often sneakerheads, or people who either collect or buy lots of sneakers. Both Adidas and Lululemon’s users seem to be a more random assortment of people. Lululemon does, however, seem to be at (@) tweeted by a lot of fitness and health inclined people.

# Screen names of the 100 most frequent posters per brand.
top_nike = nike_df['user'].value_counts(ascending=False).head(100).index.values
top_adidas = adidas_df['user'].value_counts(ascending=False).head(100).index.values
top_lululemon = lululemon_df['user'].value_counts(ascending=False).head(100).index.values
# Directed user -> brand mention graph, restricted to each brand's top posters.
nt = Network('1200', '1600', directed=True, notebook=True, cdn_resources='remote')
Graph = nx.DiGraph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
tweet_groups = [lululemon, adidas, nike]
tweet_users = [top_lululemon, top_adidas, top_nike]

for group, users in zip(tweet_groups, tweet_users):
  for screen_name, brand, _tweet in group:
    if screen_name not in users:
      continue
    if Graph.has_edge(screen_name, brand):
      edge = Graph.edges[screen_name, brand]
      mentions = edge['title'] + 1          # 'title' holds the raw mention count (hover text)
      edge['title'] = mentions
      edge['weight'] = mentions // 100 + 1  # compress raw counts into drawable edge widths
    else:
      Graph.add_edge(screen_name, brand, weight=1, title=1)

nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/user_mention_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 285
Edges: 300

Analysis

Html link here!

We can see from the pyvis network graph that of the top 100 users for each company, the red cluster (Nike) and yellow cluster (Adidas) had more users in common than the blue cluster (Lululemon). This makes sense as both Nike and Adidas are primarily known for their shoes, while Lululemon is known for their clothing. Similarly, even when comparing clothing, which Nike and Adidas also manufacture, they primarily focus on workout clothing, while Lululemon is famous for its yoga attire.

Similarly, when looking at the number and size of node linkages, Nike has lots of users who continually at (@) tweet them. One user, SneakerScouts, tweeted almost 7,000 times within the dataset. Comparatively, Adidas and Lululemon did not have any followers who tweeted at (@) them that much.

There do seem to be a handful of users who tweet at both Nike and Adidas.

We can now investigate the key words these top users are using in their quote tweets.

# Keep only tweets written by each brand's top-100 posters.
lululemon_df_users = lululemon_df[lululemon_df['user'].isin(top_lululemon)]
adidas_df_users = adidas_df[adidas_df['user'].isin(top_adidas)]
nike_df_users = nike_df[nike_df['user'].isin(top_nike)]

lululemon_user_tweets = lululemon_df_users['tweet'].values
adidas_user_tweets = adidas_df_users['tweet'].values
nike_user_tweets = nike_df_users['tweet'].values

# Count adjectives, nouns, and verbs only (Penn Treebank tags).
content_pos = ['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
lululemon_vocab_df = pd.DataFrame(list(token_counts(list(lululemon_user_tweets), parts_of_speech=content_pos).items()), columns=['word','count'])
adidas_vocab_df = pd.DataFrame(list(token_counts(list(adidas_user_tweets), parts_of_speech=content_pos).items()), columns=['word','count'])
nike_vocab_df = pd.DataFrame(list(token_counts(list(nike_user_tweets), parts_of_speech=content_pos).items()), columns=['word','count'])
# Undirected word <-> brand graph over each brand's 150 most frequent words.
nt = Network('1200', '1600', directed=False, notebook=True, cdn_resources='remote')
Graph = nx.Graph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')

vocab_df = [lululemon_vocab_df, adidas_vocab_df, nike_vocab_df]
tweet_groups = ['lululemon', 'adidas', 'nike']

for brand_vocab, brand in zip(vocab_df, tweet_groups):
  top_words = brand_vocab.sort_values('count', ascending=False).head(150)
  for _, row in top_words.iterrows():
    count = int(row['count'])
    Graph.add_edge(row['word'].lower(), brand, weight=count // 100, title=count)

nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/user_words_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 386
Edges: 433

Analysis

Html link here!

We can see that Nike in particular is referenced much more in at (@) tweets than other companies. Interestingly enough, Nike’s at (@) tweets also had a higher frequency of ‘RT,’ or retweet, than the other companies. Nike’s at (@) tweets also had a higher frequency of ‘available,’ which could indicate that more people are asking if shoes or other Nike products are available for purchase.

Lululemon had more emojis as frequent words than other companies. Nike’s most frequent word, however, was a reference to its most frequent at (@) tweeter, which appears to be a sneaker news site or account.

Segmentation by Follower Count

Because the number of unique users from the entire subset of tweets is so large, we can split the twitter mentions graph into two sections: users with over 50,000 followers and users with between 4,300 and 5,000 followers. This was done to limit the number of nodes on a graph and provide a similar number of nodes between network graphs.

# Directed user -> brand mention graph for authors with >= 50,000 followers.
nt = Network('1200', '1600', directed=True, notebook=True, cdn_resources='remote')
Graph = nx.DiGraph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
tweet_groups = [lululemon, adidas, nike]

for group in tweet_groups:
  for screen_name, brand, tweet in group:
    if tweet['user']['followers_count'] < 50000:
      continue
    if Graph.has_edge(screen_name, brand):
      edge = Graph.edges[screen_name, brand]
      mentions = edge['title'] + 1        # 'title' holds the raw mention count
      edge['title'] = mentions
      edge['weight'] = mentions // 2 + 1  # compress counts into drawable edge widths
    else:
      Graph.add_edge(screen_name, brand, weight=1, title=1)

nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/high_follower_mention_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 285
Edges: 300

Analysis

Html link here!

We can see that most of the high follower count users at (@) tweet Nike more than the other brands. Adidas is (@) tweeted less and Lululemon even less so. We can also see there is a similar distribution to the number of users who at (@) tweet multiple companies. Most users who at (@) tweet two companies will (@) tweet Nike and Adidas. Very few users at (@) tweet all three companies.

# Directed user -> brand mention graph for authors with 4,300-5,000 followers.
nt = Network('1200', '1600', directed=True, notebook=True, cdn_resources='remote')
Graph = nx.DiGraph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
tweet_groups = [lululemon, adidas, nike]

for group in tweet_groups:
  for screen_name, brand, tweet in group:
    followers = tweet['user']['followers_count']
    if not (4300 <= followers <= 5000):
      continue
    if Graph.has_edge(screen_name, brand):
      edge = Graph.edges[screen_name, brand]
      mentions = edge['title'] + 1        # 'title' holds the raw mention count
      edge['title'] = mentions
      edge['weight'] = mentions // 2 + 1  # compress counts into drawable edge widths
    else:
      Graph.add_edge(screen_name, brand, weight=1, title=1)

nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/low_follower_mention_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 805
Edges: 869

Analysis

Html link here!

We can see that the distribution of at (@) tweets from lower follower tweeters is very similar to the high follower tweeters.

We can now investigate the semantic network graphs created from these two segments.

# Materialize follower counts, then slice out the high-follower (>= 50k) segment.
lululemon_df['followers'] = lululemon_df['tweet'].apply(lambda x: x['user']['followers_count'])
adidas_df['followers'] = adidas_df['tweet'].apply(lambda x: x['user']['followers_count'])
nike_df['followers'] = nike_df['tweet'].apply(lambda x: x['user']['followers_count'])
lululemon_df_high_follower = lululemon_df[lululemon_df['followers'] >= 50000]
adidas_df_high_follower = adidas_df[adidas_df['followers'] >= 50000]
nike_df_high_follower = nike_df[nike_df['followers'] >= 50000]

lululemon_high_follower_tweets = lululemon_df_high_follower['tweet'].values
adidas_high_follower_tweets = adidas_df_high_follower['tweet'].values
nike_high_follower_tweets = nike_df_high_follower['tweet'].values

# Count adjectives, nouns, and verbs only (Penn Treebank tags).
content_pos = ['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
lululemon_vocab_df = pd.DataFrame(list(token_counts(list(lululemon_high_follower_tweets), parts_of_speech=content_pos).items()), columns=['word','count'])
adidas_vocab_df = pd.DataFrame(list(token_counts(list(adidas_high_follower_tweets), parts_of_speech=content_pos).items()), columns=['word','count'])
nike_vocab_df = pd.DataFrame(list(token_counts(list(nike_high_follower_tweets), parts_of_speech=content_pos).items()), columns=['word','count'])
# Undirected word <-> brand graph over each brand's 150 most frequent words.
nt = Network('1200', '1600', directed=False, notebook=True, cdn_resources='remote')
Graph = nx.Graph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')

vocab_df = [lululemon_vocab_df, adidas_vocab_df, nike_vocab_df]
tweet_groups = ['lululemon', 'adidas', 'nike']

for brand_vocab, brand in zip(vocab_df, tweet_groups):
  top_words = brand_vocab.sort_values('count', ascending=False).head(150)
  for _, row in top_words.iterrows():
    count = int(row['count'])
    Graph.add_edge(row['word'].lower(), brand, weight=count // 100, title=count)

nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/high_follower_words_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 338
Edges: 436

Analysis

Html link here!

We can see that the top 150 words associated with each of the brands from tweets with a high follower count are distributed similarly to the previous semantic network graphs. Both Nike and Adidas have more word overlap than with Lululemon. There are a few words that all three companies have in common, but all words are fairly common on twitter and when discussing a company, including rt (re-tweet), store, and online.

Interestingly, a fair number of the overlapping words from Nike and Adidas are other companies (xbox, starbucks, subway, etc.). Both Adidas and Lululemon have a handful of emojis as their most frequent words.

# Slice out the low-follower (4,300-5,000) segment.
lululemon_df_low_follower = lululemon_df[(lululemon_df['followers'] >= 4300)&(lululemon_df['followers'] <= 5000)]
adidas_df_low_follower = adidas_df[(adidas_df['followers'] >= 4300)&(adidas_df['followers'] <= 5000)]
nike_df_low_follower = nike_df[(nike_df['followers'] >= 4300)&(nike_df['followers'] <= 5000)]

lululemon_low_follower_tweets = lululemon_df_low_follower['tweet'].values
adidas_low_follower_tweets = adidas_df_low_follower['tweet'].values
nike_low_follower_tweets = nike_df_low_follower['tweet'].values

# Count adjectives, nouns, and verbs only (Penn Treebank tags).
content_pos = ['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
lululemon_vocab_df = pd.DataFrame(list(token_counts(list(lululemon_low_follower_tweets), parts_of_speech=content_pos).items()), columns=['word','count'])
adidas_vocab_df = pd.DataFrame(list(token_counts(list(adidas_low_follower_tweets), parts_of_speech=content_pos).items()), columns=['word','count'])
nike_vocab_df = pd.DataFrame(list(token_counts(list(nike_low_follower_tweets), parts_of_speech=content_pos).items()), columns=['word','count'])
# Undirected word <-> brand graph over each brand's 150 most frequent words.
nt = Network('1200', '1600', directed=False, notebook=True, cdn_resources='remote')
Graph = nx.Graph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')

vocab_df = [lululemon_vocab_df, adidas_vocab_df, nike_vocab_df]
tweet_groups = ['lululemon', 'adidas', 'nike']

for brand_vocab, brand in zip(vocab_df, tweet_groups):
  top_words = brand_vocab.sort_values('count', ascending=False).head(150)
  for _, row in top_words.iterrows():
    count = int(row['count'])
    Graph.add_edge(row['word'].lower(), brand, weight=count // 100, title=count)

nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/low_follower_words_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 353
Edges: 439

Analysis

Html link here!

We can see that the top 150 words associated with each of the brands from tweets with a low follower count are noticeably different from previous semantic network graphs. Here, there are more overlapping words between the three companies, which could indicate that there is less brand recognition among users with lower follower counts. Visually, it looks like Lululemon has the largest number of unshared words while Nike has the least.

Sentiment Analysis

We can also attempt to classify tweets by sentiment and take a small subset of each to analyze user and word networks.

# takes a while to run: scores every tweet with VADER via DataFrame.apply
lululemon_df['tweet_sentiment'] = lululemon_df['tweet'].apply(tweet_sentiment)
adidas_df['tweet_sentiment'] = adidas_df['tweet'].apply(tweet_sentiment)
nike_df['tweet_sentiment'] = nike_df['tweet'].apply(tweet_sentiment)
# Very positive tweets only (compound score >= 0.95).
lululemon_df_high_sentiment = lululemon_df[lululemon_df['tweet_sentiment'] >= 0.95]
adidas_df_high_sentiment = adidas_df[adidas_df['tweet_sentiment'] >= 0.95]
nike_df_high_sentiment = nike_df[nike_df['tweet_sentiment'] >= 0.95]


lululemon_high_sentiment_tweets = lululemon_df_high_sentiment['tweet'].values
adidas_high_sentiment_tweets = adidas_df_high_sentiment['tweet'].values
nike_high_sentiment_tweets = nike_df_high_sentiment['tweet'].values

# Count adjectives, nouns, and verbs only (Penn Treebank tags).
content_pos = ['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
lululemon_vocab_df = pd.DataFrame(list(token_counts(list(lululemon_high_sentiment_tweets), parts_of_speech=content_pos).items()), columns=['word','count'])
adidas_vocab_df = pd.DataFrame(list(token_counts(list(adidas_high_sentiment_tweets), parts_of_speech=content_pos).items()), columns=['word','count'])
nike_vocab_df = pd.DataFrame(list(token_counts(list(nike_high_sentiment_tweets), parts_of_speech=content_pos).items()), columns=['word','count'])
# Directed user -> brand graph for the high-sentiment tweets.
nt = Network('1200', '1600', directed=True, notebook=True, cdn_resources='remote')
Graph = nx.DiGraph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
tweet_groups = [lululemon_high_sentiment_tweets, adidas_high_sentiment_tweets, nike_high_sentiment_tweets]
tweet_dest = ['lululemon','adidas','nike']

for group, brand in zip(tweet_groups, tweet_dest):
  for tweet in group:
    screen_name = tweet['user']['screen_name']
    if Graph.has_edge(screen_name, brand):
      edge = Graph.edges[screen_name, brand]
      mentions = edge['title'] + 1        # 'title' holds the raw mention count
      edge['title'] = mentions
      edge['weight'] = mentions // 5 + 1  # compress counts into drawable edge widths
    else:
      Graph.add_edge(screen_name, brand, weight=1, title=1)

nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/high_sentiment_mention_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 221
Edges: 223

Analysis

Html link here!

We can see that there is very little overlap between users who at (@) tweet with a high (positive) sentiment. Only Nike and Adidas share users, while Lululemon has no users in common with Nike or Adidas. Lululemon does have the overall smallest number of users with a high sentiment tweet, but this is expected because Lululemon was the smallest subset of tweets in this dataset.

# Undirected word <-> brand graph over the high-sentiment vocabularies.
nt = Network('1200', '1600', directed=False, notebook=True, cdn_resources='remote')
Graph = nx.Graph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')

vocab_df = [lululemon_vocab_df, adidas_vocab_df, nike_vocab_df]
tweet_groups = ['lululemon', 'adidas', 'nike']

for brand_vocab, brand in zip(vocab_df, tweet_groups):
  top_words = brand_vocab.sort_values('count', ascending=False).head(150)
  for _, row in top_words.iterrows():
    count = int(row['count'])
    Graph.add_edge(row['word'].lower(), brand, weight=count // 100, title=count)

nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/high_sentiment_words_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 323
Edges: 430

Analysis

Html link here!

There are a fair amount of shared words from the positive sentiment tweets. Some words were either brand names or names, but most were positive descriptors. Between all three companies, there were shared words like ‘family’, ‘good’, and ‘amazing’, which was not seen in the previous semantic network graphs.

# Very negative tweets only (compound score <= -0.92).
lululemon_df_low_sentiment = lululemon_df[lululemon_df['tweet_sentiment'] <= -0.92]
adidas_df_low_sentiment = adidas_df[adidas_df['tweet_sentiment'] <= -0.92]
nike_df_low_sentiment = nike_df[nike_df['tweet_sentiment'] <= -0.92]


lululemon_low_sentiment_tweets = lululemon_df_low_sentiment['tweet'].values
adidas_low_sentiment_tweets = adidas_df_low_sentiment['tweet'].values
nike_low_sentiment_tweets = nike_df_low_sentiment['tweet'].values

# Count adjectives, nouns, and verbs only (Penn Treebank tags).
content_pos = ['JJ','JJR','JJS','NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
lululemon_vocab_df = pd.DataFrame(list(token_counts(list(lululemon_low_sentiment_tweets), parts_of_speech=content_pos).items()), columns=['word','count'])
adidas_vocab_df = pd.DataFrame(list(token_counts(list(adidas_low_sentiment_tweets), parts_of_speech=content_pos).items()), columns=['word','count'])
nike_vocab_df = pd.DataFrame(list(token_counts(list(nike_low_sentiment_tweets), parts_of_speech=content_pos).items()), columns=['word','count'])
# Directed user -> brand graph for the low-sentiment tweets.
nt = Network('1200', '1600', directed=True, notebook=True, cdn_resources='remote')
Graph = nx.DiGraph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')
tweet_groups = [lululemon_low_sentiment_tweets, adidas_low_sentiment_tweets, nike_low_sentiment_tweets]
tweet_dest = ['lululemon','adidas','nike']

for group, brand in zip(tweet_groups, tweet_dest):
  for tweet in group:
    screen_name = tweet['user']['screen_name']
    if Graph.has_edge(screen_name, brand):
      edge = Graph.edges[screen_name, brand]
      mentions = edge['title'] + 1        # 'title' holds the raw mention count
      edge['title'] = mentions
      edge['weight'] = mentions // 5 + 1  # compress counts into drawable edge widths
    else:
      Graph.add_edge(screen_name, brand, weight=1, title=1)

nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/low_sentiment_mention_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 202
Edges: 203

Analysis

Html link here!

For the negative sentiment tweets, only Nike and Adidas shared users who at (@) tweeted them. Lululemon only had one user, while Nike took up almost all the unique users who at (@) tweeted them a very negative sentiment (bad) tweet.

# Undirected word <-> brand graph over the low-sentiment vocabularies.
nt = Network('1200', '1600', directed=False, notebook=True, cdn_resources='remote')
Graph = nx.Graph()
Graph.add_node('lululemon', size=35, group=1, color='blue')
Graph.add_node('nike', size=35, group=2, color='red')
Graph.add_node('adidas', size=35, group=3, color='yellow')

vocab_df = [lululemon_vocab_df, adidas_vocab_df, nike_vocab_df]
tweet_groups = ['lululemon', 'adidas', 'nike']

for brand_vocab, brand in zip(vocab_df, tweet_groups):
  top_words = brand_vocab.sort_values('count', ascending=False).head(150)
  for _, row in top_words.iterrows():
    count = int(row['count'])
    Graph.add_edge(row['word'].lower(), brand, weight=count // 100, title=count)

nt.from_nx(Graph)
nt.show_buttons(filter_=['physics'])
nt.toggle_physics(False)
nt.show('example.html')
# display(HTML('example.html'))
nt.save_graph("drive/MyDrive/low_sentiment_words_pyvis.html")
print('Nodes:', Graph.order())
print('Edges:', Graph.size())
Nodes: 260
Edges: 303

Analysis

Html link here!

For the negative sentiment tweets, there were still some shared words between companies. Interestingly enough, the only word shared by all three companies was ‘service,’ suggesting that most users who tweeted with a negative sentiment were likely unhappy with the companies’ service — potentially their customer service.