https://almafaz.github.io/index.html#hero
https://github.com/almafaz/almafaz.github.io
Group members: Simone von Mehren (s190739), Natasha Norsker (s194270) and Alma Fazlagic (s194271)
All group members contributed equally, but the main contributors on the sub-sections were:
In this project we worked with a large dataset of tweets discussing the ongoing war between Ukraine and Russia. The dataset forming the foundation for this project was originally retrieved from the GitHub repo ehsanulhaq1/russo_ukraine_dataset and consists of tweet ids for all tweets containing certain keywords relevant to the war in the period from 21st February 2022 to 15th April 2022. Using the tweet ids, we were able to download the corresponding tweets from Twitter along with a number of attributes including the user name, language, date of creation, etc. Twitter is used by 229 million users daily (www.aljazeera.com) and is thereby an extensive source for insights into the public opinion on a given matter. The original dataset collected by ehsanulhaq consists of 3 million+ tweets distributed over 54 days, and we subsampled one million tweets for the analysis and data visualisations in this project.
The Russian invasion of Ukraine has shaken many countries and has severely affected economies, families and political relationships across the world. As a consequence of Russia's war crimes, its threat to Ukraine's independence and its strained relationship with NATO, many countries have imposed sanctions on Russia, resulting in price increases and a massive decrease in deliveries of gas from Russia to the rest of the world. More importantly, more than 14,000 Ukrainians have lost their lives and many more have become refugees in Europe.
All of these reasons make it interesting and highly relevant to analyse the public opinion and discourse towards Russia and the war in general - and Twitter is a great source for accessing this. We want to investigate whether there has been any change in the discourse and sentiment towards Ukraine and Russia during the period of the war, and whether there are any differences in the political and public opinion from one country to another.
Furthermore, we will investigate how users interact within the Ukrainian- and Russian-language parts of Twitter by doing a network analysis, where we look at both clustering trends and community detection.
In terms of the end users' experience, we wanted to create an interactive webpage which invites the users to dig into the investigation themselves. Overall, we wanted to present an overview of what people are saying about the war on Twitter, which main subjects are discussed, and who people interact with within their spoken language.
We aimed at presenting our findings in a user-friendly and comprehensible way using a variety of data visualisations and interactive network graphs. The webpage was designed to emphasise the time and language aspects of the analysis, making it easy for the user to understand our results and focus on one area at a time. The interactive parts of the webpage invite the user to engage in the analysis themselves, and it was furthermore our hope that these features would make the experience fun and more appealing to the user.
The following sections of code include everything used for data preprocessing and cleaning. Due to the high memory usage and heavy computations, everything has been run on an HPC cluster and is simply displayed here for documentation purposes.
The original dataset includes tweet ids retrieved from Twitter using the Twitter Streaming API and covers tweets from each day in the period 21-02-2022 to 15-04-2022. The dataset is available at the GitHub repo ehsanulhaq1/russo_ukraine_dataset, where the authors are also listed.
The tweet ids have been collected based on a range of keywords relevant to the war such as (russia, ukraine, putin, zelensky, russian, ukrainian, keiv, kyiv). However, the list of keywords was continuously updated by the authors during the period of data collection, as more and more keywords became relevant to the crisis at the time of download. For access to the complete list of keywords, we refer to the original GitHub repo (see link above). We reached out to the owner of the GitHub repo with the purpose of retrieving the specific keywords he used; unfortunately he was not able to get back to us before the hand-in of the assignment. The Twitter Streaming API tracked the requested keywords and provided any tweet that contained any of them.
The first snippet of code seen below has been used to download the tweet ids from the original GitHub repo. To make sure we had a uniform distribution of data over the whole period, we extracted 18,000 randomly chosen tweet ids from the GitHub repo for each day. This yielded a dataset of approximately 1 million tweets (whereas the original dataset included 3 million+ tweets). Reducing the size of the dataset was a necessity in this project, as it would have been infeasible for us to use the entire original dataset due to heavy computations and high memory usage.
We are, however, aware that our final dataset is only a subset of the original dataset and therefore only represents a subset of the discussion of the war on Twitter. In order to make our subset as representative of the full dataset as possible, we extracted the tweet ids for each day randomly (i.e. randomised on tweet upload time), as briefly mentioned above. Other factors contribute to this limitation as well. For example, the keywords used when collecting the tweet ids from Twitter control which tweets are included in and excluded from the dataset. It is, however, obvious that many tweets within the subject of the Ukraine-Russia war will not include these keywords, and we must therefore be aware of this limitation when drawing any conclusions based on our dataset.
import numpy as np
import random
import pandas as pd
import tweepy as tw
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import word_tokenize
import re
import glob
import os
feb = ['2022-02-'+str(date) +'/2022-02-'+str(date) +"_" for date in range(21,29)]
march = ['2022-03-'+str(date).zfill(2) +'/2022-03-'+str(date).zfill(2) +"_" for date in range(1,32)]
april = ['2022-04-'+str(date).zfill(2) +'/2022-04-'+str(date).zfill(2) +"_" for date in range(1,16)]
datedate = feb + march + april
num_csv_pr_day = [22, 20, 48, 72, 72, 72, 78, 79, 7, 5, 6, 6, 5, 6, 5, 6, 5, 6, 6, 6, 5, 6, 5, 6, 5, 5, 6, 5, 6, 5, 5, 6, 5, 6, 5, 6, 6, 5, 6, 5, 5, 6, 5, 6, 5, 6, 5, 5, 6, 5, 6, 5, 6, 5]
ids = []
for i in range(len(datedate)):
date_ids = []
num_csv_picks = int(np.floor(num_csv_pr_day[i]/3))
csv_picks = random.sample(range(1,num_csv_pr_day[i]), num_csv_picks)
tweets_per_csv = int(np.floor(18000/num_csv_picks))
for j in csv_picks:
git_url = 'https://raw.githubusercontent.com/ehsanulhaq1/russo_ukraine_dataset/main/'+ datedate[i] + str(j) + '.csv'
df = pd.read_csv(git_url, header=0)
ids_temp = [item for sublist in list(df.values) for item in sublist][:tweets_per_csv]
ids.append(ids_temp)
ids = [item for sublist in ids for item in sublist]
ids_df = pd.DataFrame(ids)
ids_df.to_csv("ALL_IDS.csv")
After retrieving the tweet ids, we used the Twitter API (through the tweepy library) to collect the corresponding tweets from Twitter and saved them all in a .csv-file.
Our dataset consists of approximately 18,000 tweets per day distributed over 54 days, and it includes tweets from 651,363 users in total. As the majority of users do not allow tracking of geolocation, we were not able to save the countries/locations of the users and tweets as first intended. Instead, we tracked the language used in each tweet, and in total we found 32 languages.
We saved the following 8 attributes for each tweet:
The full text and hashtags were saved to be used in our text and discourse analysis later on. We used the creation date and written language of the tweet to filter the tweets on time and language, respectively, when performing the text and network analysis, and the tweet id and parent author were used to generate several social network graphs during the analysis.
consumer_key= 'insert consumer key here'
consumer_secret= 'insert consumer secret key here'
access_token= 'insert access token here'
access_token_secret= 'insert access token secret here'
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth, wait_on_rate_limit=True)
ALL_IDS = pd.read_csv("ALL_IDS.csv")
ALL_IDS = ALL_IDS.iloc[: , 1:]
ALL_IDS.columns = ["ids"]
ALL_IDS.drop(ALL_IDS.tail(25).index,inplace=True)
#create pandas dataframe
tweets_df = pd.DataFrame(columns = ['text','tweet_type','tweet_id', 'username','parent_author',
'created_utc', 'location','language', 'place'])
n=100
step_size = 100
#BIG LOOP
for i in range(0, len(ALL_IDS), step_size):
batch = [item for sublist in list(ALL_IDS.values[i:n]) for item in sublist]
n+=100
# fetching the statuses from the API
statuses = api.lookup_statuses(batch,tweet_mode="extended")
#retrieve all tweet attributes
for status in statuses:
tweet_id = status.id
username = status.user.screen_name
if hasattr(status, "retweeted_status"):
text = status.retweeted_status.full_text
parent_author = status.retweeted_status.user.screen_name
tweet_type = "retweet"
else:
text = status.full_text
parent_author = status.in_reply_to_screen_name
if parent_author != None:
tweet_type = "reply"
else:
tweet_type = "original"
created = status.created_at
location = status.user.location
language = status.lang
place = status.place
#gather tweet attributes
tweet_attribitues = [text, tweet_type, tweet_id,username, parent_author, created, location, language,place]
# Append to dataframe
tweets_df.loc[len(tweets_df)] = tweet_attribitues
if (len(tweets_df) % 25000 == 0):
tweets_df.to_csv("{}_tester_data.csv".format(i))
tweets_df = pd.DataFrame(columns = ['text','tweet_type','tweet_id', 'username','parent_author',
'created_utc', 'location','language', 'place'])
# concatenating all .csv-files in one
path = r'data_collection/'
all_files = glob.glob(os.path.join(path, "*.csv"))
df_from_each_file = (pd.read_csv(f, engine='python') for f in all_files)
concatenated_df = pd.concat(df_from_each_file, ignore_index=True)
concatenated_df.to_csv("ALL_DIRTY_TWEETS.csv")
After extracting all tweets, we performed multiple steps of data cleaning before using the data for analysis. The following major steps were performed on all tweets in our data cleaning process: extracting hashtags from the raw text, translating non-English tweets (and hashtags) into English, lowercasing, removing links, emojis and special characters, tokenising, removing stopwords and stemming the tokens.
The code used for this process can be seen below.
from deep_translator import GoogleTranslator
nltk.download('stopwords')
nltk.download('punkt')
stopword_vocab = stopwords.words('english')
ps = PorterStemmer()
# the following functions can be applied to a df with pandas apply function
# translate text to english
def translate_text(text, source='auto', target='en'):
lang_dict = {'afrikaans': 'af', 'albanian': 'sq', 'amharic': 'am', 'arabic': 'ar', 'armenian': 'hy', 'azerbaijani': 'az', 'basque': 'eu', 'belarusian': 'be', 'bengali': 'bn', 'bosnian': 'bs', 'bulgarian': 'bg', 'catalan': 'ca', 'cebuano': 'ceb', 'chichewa': 'ny', 'chinese (simplified)': 'zh-CN', 'chinese (traditional)': 'zh-TW', 'corsican': 'co', 'croatian': 'hr', 'czech': 'cs', 'danish': 'da', 'dutch': 'nl', 'english': 'en', 'esperanto': 'eo', 'estonian': 'et', 'filipino': 'tl', 'finnish': 'fi', 'french': 'fr', 'frisian': 'fy', 'galician': 'gl', 'georgian': 'ka', 'german': 'de', 'greek': 'el', 'gujarati': 'gu', 'haitian creole': 'ht', 'hausa': 'ha', 'hawaiian': 'haw', 'hebrew': 'iw', 'hindi': 'hi', 'hmong': 'hmn', 'hungarian': 'hu', 'icelandic': 'is', 'igbo': 'ig', 'indonesian': 'id', 'irish': 'ga', 'italian': 'it', 'japanese': 'ja', 'javanese': 'jw', 'kannada': 'kn', 'kazakh': 'kk', 'khmer': 'km', 'kinyarwanda': 'rw', 'korean': 'ko', 'kurdish': 'ku', 'kyrgyz': 'ky', 'lao': 'lo', 'latin': 'la', 'latvian': 'lv', 'lithuanian': 'lt', 'luxembourgish': 'lb', 'macedonian': 'mk', 'malagasy': 'mg', 'malay': 'ms', 'malayalam': 'ml', 'maltese': 'mt', 'maori': 'mi', 'marathi': 'mr', 'mongolian': 'mn', 'myanmar': 'my', 'nepali': 'ne', 'norwegian': 'no', 'odia': 'or', 'pashto': 'ps', 'persian': 'fa', 'polish': 'pl', 'portuguese': 'pt', 'punjabi': 'pa', 'romanian': 'ro', 'russian': 'ru', 'samoan': 'sm', 'scots gaelic': 'gd', 'serbian': 'sr', 'sesotho': 'st', 'shona': 'sn', 'sindhi': 'sd', 'sinhala': 'si', 'slovak': 'sk', 'slovenian': 'sl', 'somali': 'so', 'spanish': 'es', 'sundanese': 'su', 'swahili': 'sw', 'swedish': 'sv', 'tajik': 'tg', 'tamil': 'ta', 'tatar': 'tt', 'telugu': 'te', 'thai': 'th', 'turkish': 'tr', 'turkmen': 'tk', 'ukrainian': 'uk', 'urdu': 'ur', 'uyghur': 'ug', 'uzbek': 'uz', 'vietnamese': 'vi', 'welsh': 'cy', 'xhosa': 'xh', 'yiddish': 'yi', 'yoruba': 'yo', 'zulu': 'zu'}
if source not in lang_dict.values():
return 'und'
else:
return GoogleTranslator(source=source, target=target).translate(text)
# lowercase all letters in text
def lowercase(txt):
return txt.lower()
# removes emojis, links, special characters, hashtags
def remove_links_special_characters(txt):
return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", txt).split())
# find all hashtags - might not be necessary if it is properly in data
def find_hashtags(text):
found_hashtags = re.findall(r'#\w+', text)
return [tick[1:].capitalize() for tick in found_hashtags]
# tokenize the tweets + remove stopwords
def tokenize_text(text, stem=True):
if stem:
filtered = [ps.stem(w.lower()) for w in word_tokenize(text)
if w.isalpha() and w.lower() not in stopword_vocab]
else:
filtered = [w.lower() for w in word_tokenize(text)
if w.isalpha() and w.lower() not in stopword_vocab]
return filtered
def clean_tweet(tweet, language):
clean_tw = str(tweet)
# finds the hashtags (need to be done in og language)
try:
hashtags = find_hashtags(clean_tw)
except:
hashtags = []
# lowercase - before or after hashtag?
clean_tw = lowercase(clean_tw)
# translate only if the language is not english
if language != ('en'):
clean_tw = translate_text(text=clean_tw, source=language, target='en')
#translate hashtags
try:
hashtags = [translate_text(tag, language) for tag in hashtags]
except:
hashtags = []
# clean up in text (remove links, special characters etc.)
clean_tw = remove_links_special_characters(clean_tw)
tokenized = tokenize_text(clean_tw, stem=False)
tokenized_stemmed = tokenize_text(clean_tw, stem=True)
return (clean_tw, hashtags, tokenized, tokenized_stemmed)
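# A quick toy example of what clean_tweet returns for an English tweet (no translation
# is triggered since the language is 'en'); the exact tokens are approximate:
# clean_tweet("Stand with #Ukraine! https://t.co/abc", "en")
# -> ('stand with ukraine', ['Ukraine'], ['stand', 'ukraine'], ['stand', 'ukrain'])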
all_dirty_tweets_df = pd.read_csv('ALL_DIRTY_TWEETS.csv')
clean_results = all_dirty_tweets_df.apply(lambda x: clean_tweet(x['text'], x['language']), axis=1)
all_dirty_tweets_df[['translated_text','translated_hashtags','tokenized_text','tokenized_text_stemmed']] = pd.DataFrame(clean_results.tolist(), columns=['translated_text', 'translated_hashtags', 'tokenized_text', 'tokenized_text_stemmed'])
all_dirty_tweets_df.to_csv('ALL_TWEETS_CLEAN.csv')
We created a plot of the volume of tweets per day for the English, Ukrainian and Russian languages, respectively. This was done to get an overview of the volume for the different languages we were analysing. The actual volume graph was created in HTML, but the data for the graph was extracted in Python, and the extraction code can be seen below.
import datetime as dt
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
#reading data:
data_df = pd.read_csv("/work/data/ALL_CLEAN_TWEETS.csv",converters={'tokenized_text': eval,'tokenized_text_stemmed': eval,"translated_hashtags": eval })
# make timestamp to a datetime object
data_df["date_created"] = pd.to_datetime(data_df['created_utc'], errors='coerce').dt.normalize()
data_df["date"] = data_df["date_created"].dt.date
data_df["translated_text"] = data_df["translated_text"].astype(str)
# run the following cells for 'ru', 'en' and 'uk'
volume_df = data_df.loc[data_df.language=='ru']['date_created'].dt.date.value_counts().sort_index().reset_index()
#to list so we can put it directly into the html chart.js code
[date_obj.strftime('%Y-%m-%d') for date_obj in volume_df['index']]
#to list so we can put it directly into the html chart.js code
volume_df.date_created.tolist()
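Although the volume chart on the webpage is rendered with Chart.js, a quick static version can be plotted directly from the same dataframe (a minimal sketch, assuming data_df is loaded and prepared as above):
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 4))
for lang in ['en', 'uk', 'ru']:
    # count tweets per day for each of the three languages
    counts = data_df.loc[data_df.language == lang]['date_created'].dt.date.value_counts().sort_index()
    ax.plot(counts.index, counts.values, label=lang)
ax.set_xlabel("Date")
ax.set_ylabel("Number of tweets")
ax.legend()
plt.tight_layout()
plt.show()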
It is easy to see that the volume of tweets written in English far exceeds that of tweets written in Ukrainian and Russian, respectively. This is only to be expected, as English has an overall higher number of speakers. However, this also means that English hashtag trends and keywords may dominate in the parts of the analysis where we do not split by language.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
#reading data:
data_df = pd.read_csv("/work/data/ALL_CLEAN_TWEETS.csv",converters={'tokenized_text': eval,'tokenized_text_stemmed': eval,"translated_hashtags": eval })
# make timestamp to a datetime object
data_df["date_created"] = pd.to_datetime(data_df['created_utc'], errors='coerce').dt.normalize()
data_df["date"] = data_df["date_created"].dt.date
data_df["translated_text"] = data_df["translated_text"].astype(str)
# we need the date, language and the text to do the dispersion plot
lexical_df = data_df.loc[data_df.language.isin(['en', 'ru', 'uk'])][['date', 'language', 'translated_text']].sort_index().reset_index()
# we create our list of important words
words = "belarus|loot|glory|vasylkiv|oil|economy|chernobyl| nuclear war"
# we find all the important words in the tweets
lexical_df['words'] = lexical_df.translated_text.apply(lambda x: re.findall(words, x))
# we explode the dataframe to have single important words per row
lexical_df = lexical_df.explode('words')
lexical_df = lexical_df[lexical_df['words'].notna()]
lexical_df.head()
# plotting the dispersion plot
plt.figure(figsize=(22,12))
plot = sns.stripplot(x="date", y="words", data=lexical_df, size=8, marker="s", edgecolor="gray", hue="language")
plt.xticks(rotation = 'vertical')
plt.legend(loc = 2, bbox_to_anchor = (1,1))
plt.savefig('dispersion.png')
plt.show()
Our initial intention with the dispersion plot was to use it to locate events or trends in time regarding the war. However, after numerous trials with different words, we had to conclude that the dispersion plot does not show significant differences in the magnitude of usage of the words.
We do, however, see a few interesting tendencies in the dispersion plot above. First of all, the use of the Ukrainian city name "Vasylkiv" peaks around 25th-28th February 2022, then again around 12th March 2022 and lastly in the period 1st-4th April 2022. These dates correspond to the initial and second attacks on the city and the declaration of liberation from the Russian invaders, respectively (www.theguardian.com).
Furthermore, the word "glory" was used mostly in Russian around 28th February 2022, a few days after the first attacks of the Russian invasion of Ukraine, which may imply that Russian-speaking users were rather optimistic at this point in time. The word "economy" was used significantly more in Russian than in the other languages on 8th March 2022, which is approximately a week after the West started to impose economic sanctions on Russia. It can also be seen that the expression "nuclear war" seemed to be used more in Russian around 1st April. We did not find any particular big event on that exact date, but the debate about whether a nuclear war was a significant threat did rise during April 2022, so there may be some connection here.
The following code shows how the word-shift graphs comparing the sentiment of Russian and Ukrainian tweets were made.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import shifterator as sh
from datetime import datetime, timedelta
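# NOTE: lab_dict (word -> sentiment score, with 5 as the neutral reference value, as in the
# labMT word list) is used below but is not defined in this listing. A minimal sketch of how
# such a dictionary could be built - the file name and column names here are assumptions,
# not part of the original pipeline:
labmt = pd.read_csv("labMT_english.tsv", sep="\t")
lab_dict = dict(zip(labmt["word"], labmt["happiness_average"]))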
#reading data:
data_df = pd.read_csv("/work/data/ALL_CLEAN_TWEETS.csv",converters={'tokenized_text': eval,'tokenized_text_stemmed': eval,"translated_hashtags": eval })
# make timestamp to a datetime object
data_df["date_created"] = pd.to_datetime(data_df['created_utc'], errors='coerce').dt.normalize()
data_df["date"] = data_df["date_created"].dt.date
data_df["translated_text"] = data_df["translated_text"].astype(str)
#making different dataframes for ukranian tweets and russian tweets
lang_ru = data_df.loc[data_df.language=='ru']
lang_uk = data_df.loc[data_df.language=='uk']
#explode the tokenized text columns to only contain single strings instead of lists
l_ru = lang_ru.tokenized_text.explode().to_list()
l_uk = lang_uk.tokenized_text.explode().to_list()
p_ru = dict([(key, value/len(l_ru)) for key,value in Counter(l_ru).items()])
p_uk = dict([(key, value/len(l_uk)) for key,value in Counter(l_uk).items()])
all_tokens = set(p_ru.keys()).union(set(p_uk.keys()))
dp = dict([(token,p_ru.get(token,0) - p_uk.get(token,0)) for token in all_tokens])
sorted(dp.items(),key= lambda x:x[1], reverse=True)[:10]
h = dict([(token,lab_dict.get(token,np.nan)-5) for token in all_tokens])
dPhi = dict([(token, h[token] * dp[token]) for token in all_tokens if not np.isnan(h[token])])
sorted(dPhi.items(),key= lambda x:np.abs(x[1]), reverse=True)[:11]
sentiment_shift = sh.WeightedAvgShift(type2freq_1 = p_ru,
type2freq_2 = p_uk,
type2score_1 = lab_dict,
reference_value =5)
sentiment_shift.get_shift_graph(detailed=True,
system_names = ["Russian","Ukranian"])
plt.show()
To clarify what can be seen in the plot: the light colours represent words that are used less frequently in Ukrainian tweets than in Russian tweets, and the dark colours represent words that are used more frequently in Ukrainian than in Russian tweets.
Yellow indicates positive words, while blue indicates negative words.
As can be seen from the light yellow colour, the word 'russia' is not as commonly used in tweets written in Ukrainian as it is in Russian tweets. It can also be seen that it is counted as a positive word. Since the intention of a tweet is not necessarily for 'russia' to be positive, this might skew the overall result of the word shift so that the Ukrainian tweets are counted more negatively than they perhaps should have been.
To fix this issue, we would either need some contextual information and assign 'russia' and 'russian' our own sentiment scores depending on the context, or we would have to remove the words completely. Both of these seemed to be subpar solutions, since the decrease in the use of 'russia' and 'russian' is still a valuable insight, and creating a context-specific sentiment score for only the words 'russian' and 'russia' would make the word shift unnecessarily complicated.
It can be seen that Ukrainian tweets more often include the word 'glory'. Ukraine has a war slogan, 'Slava Ukraini', which translates to 'Glory to Ukraine' in English. 'Win', 'victory' and 'enemy' are also more frequently used in Ukrainian tweets.
Russian tweets more often use the negative words 'bomb', 'operation', 'killed' and 'attacked' than Ukrainian tweets do. They also more often use positive words like 'russia', 'russian', 'like' and 'special', where, as previously stated, 'russia' and 'russian' are slightly more ambiguous in terms of what the sentiment score should be.
Overall, Ukrainian tweets have a higher sentiment score than Russian tweets, although the shift in sentiment score is not large.
# this allows you to pull the scores from the shift graph (because we made ours in html)
print(sentiment_shift.get_shift_scores()['world']*100)
print(sentiment_shift.get_shift_component_sums())
# min date range is the first 14 days of the war
min_date_range = pd.date_range(data_df.date.sort_index()[0], periods=14).date
# max date range is the last 14 days of the data
max_date_range = pd.date_range(end=data_df.date.sort_index().iloc[-1], periods=14).date
min_date_df = data_df.loc[data_df.date.isin(min_date_range)]
max_date_df = data_df.loc[data_df.date.isin(max_date_range)]
#explode text columns to only have a single word per row instead of list
l_bef = min_date_df.tokenized_text.explode().to_list()
l_aft = max_date_df.tokenized_text.explode().to_list()
p_bef = dict([(key, value/len(l_bef)) for key,value in Counter(l_bef).items()])
p_aft = dict([(key, value/len(l_aft)) for key,value in Counter(l_aft).items()])
all_tokens = set(p_bef.keys()).union(set(p_aft.keys()))
dp = dict([(token,p_bef.get(token,0) - p_aft.get(token,0)) for token in all_tokens])
sorted(dp.items(),key= lambda x:x[1], reverse=True)[:10]
h = dict([(token,lab_dict.get(token,np.nan)-5) for token in all_tokens])
dPhi = dict([(token, h[token] * dp[token]) for token in all_tokens if not np.isnan(h[token])])
sorted(dPhi.items(),key= lambda x:np.abs(x[1]), reverse=True)[:11]
sentiment_shift = sh.WeightedAvgShift(type2freq_1 = p_bef,
type2freq_2 = p_aft,
type2score_1 = lab_dict,
reference_value =5)
sentiment_shift.get_shift_graph(detailed=True,
system_names = ["First Two Weeks","Last Two Weeks"])
plt.savefig('wordshift_time.png')
plt.show()
To clarify what can be seen in the plot: the light colours represent words that were used less frequently in the last two weeks than in the first two weeks, and the dark colours represent words that are used more frequently in the last two weeks than in the first two weeks.
Yellow indicates positive words, while blue indicates negative words.
As can be seen from the light blue colour, the words 'crimes', 'rape' and 'raped' were more frequently used in the first two weeks. However, the words 'stop', 'refugees', 'nuclear' and 'hospital' are more frequently used now. This might be because there is a larger number of refugees now than there was in the first two weeks of the war, and the topic of nuclear war and nuclear explosions has been more prevalent ever since the attack near Chernobyl.
The word 'plant' is more frequently used now, but it is not easy to determine whether this refers to crops or to nuclear power plants without more context.
The overall sentiment is slightly higher in the last two weeks of the data than in the first two weeks. However, this difference is very small - only a 0.01 shift in sentiment score.
# this allows you to pull the scores from the shift graph (because we made ours in html)
print(sentiment_shift.get_shift_scores()['crimes']*100)
print(sentiment_shift.get_shift_component_sums())
We constructed several word clouds filtered on time and language, respectively, in order to display and highlight relevant words in the debate on Twitter. The word clouds were generated based on TF-IDF scores in order to highlight not simply the most frequently used words, but the most relevant words for the specific language or time period.
The term frequency (TF) is computed using the following formula: $$ \large \mathrm{TF}=\frac{\text { count of } \mathrm{t} \text { in } \mathrm{d}}{\text { total number of words in } \mathrm{d}} $$ where t is the term and d denotes the document.
The IDF formula is as follows: $$ \large \mathrm{IDF}=\log \left(\frac{\mathrm{N}}{\mathrm{df}+1}\right) $$ where N is the total number of documents, df is the document frequency (number of documents in which term t occurs) and 1 is the shrink term.
Usually it is smart to add a shrink term to the denominator (+1 for example) to avoid division by zero.
However, in our case we are certain that all terms are present in at least one of the documents, and therefore we can omit the shrink term (which in return ensures no negative IDF values). Our final IDF formula used in the assignment is therefore the following:
$$ \large \mathrm{IDF}=\log \left(\frac{\mathrm{N}}{\mathrm{df}}\right) $$
Three word clouds were constructed filtered on language: English, Ukrainian and Russian. In the same way, three word clouds were constructed filtered on time for the periods 21st Feb 2022 - 10th March 2022, 11th March 2022 - 29th March 2022 and 30th March 2022 - 15th April 2022.
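To make the formulas concrete, here is a toy computation on three made-up documents (a minimal sketch, separate from the actual pipeline below):
import numpy as np
from collections import Counter
docs = [["ukraine", "war", "news"], ["russia", "war"], ["peace", "talks"]]
N = len(docs)
tf = {w: c / len(docs[0]) for w, c in Counter(docs[0]).items()}   # TF for the first document
df = Counter(w for doc in docs for w in set(doc))                 # document frequency per term
tf_idf = {w: round(t * np.log(N / df[w]), 3) for w, t in tf.items()}
print(tf_idf)  # {'ukraine': 0.366, 'war': 0.135, 'news': 0.366} - "war" occurs in two documents, so it scores lower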
In the sections below, the code used for word cloud generation can be seen. It should be noted that a random subsample of 50,000 data points was used to generate the word clouds, as it was infeasible in terms of computation time to use all of the 1 million data points. This does of course mean that the corpus only represents a 5% subset of the original, which will most likely affect the IDF scores in particular. However, we assume that this is still a representative cross section of the data for the purpose of this specific part of the analysis.
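In addition to pandas (imported earlier), the code below relies on the following imports, which were not shown in the original listing:
import itertools
import numpy as np
from collections import Counter
from tqdm import tqdm
from wordcloud import WordCloud
from PIL import Image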
# load all data
clean_text = pd.read_csv("/work/data/ALL_CLEAN_TWEETS.csv",converters={'tokenized_text': eval,'tokenized_text_stemmed': eval,"translated_hashtags": eval }, low_memory=False)
# subsample 50.000 data points randomly
clean_text = clean_text.sample(n=50000, random_state=1)
all_lang = ['en', 'pt', 'ru', 'ja', 'es', 'ca', 'de', 'fr', 'th', 'fi', 'uk',
'et', 'it', 'tr', 'und', 'pl', 'zh', 'nl', 'cs', 'ro', 'no', 'hi',
'el', 'ht', 'fa', 'in', 'tl', 'ta', 'da', 'ar', 'sv', 'sr', 'sl',
'lv']
# function to create large documents of all words within one category (language or time period)
def large_document(lang):
lang_list = clean_text.loc[clean_text['language'] == lang].tokenized_text.tolist()
lang_list = list(itertools.chain(*lang_list))
return lang_list
# function to compute TF scores
def computeTF(doc):
TF_dict = dict([(key, round(value/len(doc),6)) for key,value in Counter(doc).items()])
return TF_dict
# create large documents for all 32 languages
for lang in all_lang:
globals() [f'{lang}_doc'] = large_document(lang)
all_docs = [en_doc, pt_doc, ru_doc, ja_doc, es_doc, ca_doc, de_doc, fr_doc, th_doc, fi_doc, uk_doc,
et_doc, it_doc, tr_doc, und_doc, pl_doc, zh_doc, nl_doc, cs_doc, ro_doc, no_doc, hi_doc,
el_doc, ht_doc, fa_doc, in_doc, tl_doc, ta_doc, da_doc, ar_doc, sv_doc, sr_doc, sl_doc,
lv_doc]
# compute TFs for all large documents
for doc, lang in zip(all_docs, all_lang):
globals() [f'{lang}_TF'] = computeTF(doc)
all_TF = [en_TF, pt_TF, ru_TF, ja_TF, es_TF, ca_TF, de_TF, fr_TF, th_TF, fi_TF, uk_TF,
et_TF, it_TF, tr_TF, und_TF, pl_TF, zh_TF, nl_TF, cs_TF, ro_TF, no_TF, hi_TF,
el_TF, ht_TF, fa_TF, in_TF, tl_TF, ta_TF, da_TF, ar_TF, sv_TF, sr_TF, sl_TF,
lv_TF]
# Create corpora including the 32 large documents
corpora = {}
for lang in range(len(all_docs)):
corpora[lang] = all_docs[lang]
corpora_list = []
for i in range(len(corpora)):
corpora_list.append(corpora[i])
corpora_set = set([term for sublist in corpora_list for term in sublist])
# compute IDF scores for all words in corpus
IDFs = {}
N = len(corpora)
for term in tqdm(corpora_set):
# compute document frequency for each term:
counter = 0
for i in range(len(corpora)):
if term in set(corpora[i]):
counter += 1
# Compute IDF for term
#print(term)
IDFs[term] = np.log(N / counter)
# Create function for computing TF-IDF values
def computeTF_IDF(TF_dict, IDF_dict):
tfidf_dict_lang = {}
for key, val in TF_dict.items():
tf = val
idf = IDF_dict[key]
tfidf_dict_lang[key] = round(tf*idf, 6)
return tfidf_dict_lang
# compute TF-IDF scores for all languages
for lang, tf in zip(all_lang, all_TF):
globals() [f'{lang}_tf_idf'] = computeTF_IDF(tf, IDFs)
# function for generating wordcloud
def generate_wordcloud(data, mask=None):
cloud = WordCloud(scale=4,
max_words=150,
colormap='twilight',
mask=mask,
background_color='white',
).generate_from_frequencies(data)
return cloud
# English wordcloud
en_mask = np.array(Image.open('masks/world_mask.png'))
wordcloud_en = generate_wordcloud(en_tf_idf, mask=en_mask)
wordcloud_en.to_file("english_wc.png")
# Russian wordcloud
ru_mask = np.array(Image.open('masks/ru_mask.png'))
wordcloud_ru = generate_wordcloud(ru_tf_idf, mask=ru_mask)
wordcloud_ru.to_file("russian_wc.png")
# Ukranian wordcloud
uk_mask = np.array(Image.open('masks/ukr_mask.png'))
wordcloud_uk = generate_wordcloud(uk_tf_idf, mask=uk_mask)
wordcloud_uk.to_file("ukranian_wc.png")
Looking at the English word cloud, we can get a sense of which words and subjects are discussed in general all over the world, as English is the language most often used to communicate between different nationalities. Some of the most prominent words are "russia(n)", "ukraine(ian)", "zelensky", "putin", "biden" and "trump", which are all very general in terms of the war. In other words, the presence of these words could indicate that the political situation and the factual development of the war is what is discussed the most across nationalities. This seems sensible, as this is how most countries are affected by the war and therefore probably what is interesting to most people worldwide. Other words representing more specific subjects include "gas", "oligarchs", "nato", "oil", "fight", "support" and "kyiv". These most likely reflect the consequences of the Western sanctions towards Russia, which have caused a shortage of gas and oil supply as well as price increases. It is also highly debated to what extent NATO should help Ukraine, and the presence of this word could indicate that people are expressing their opinion about this matter online. Lastly, words like "please", "fight" and "kyiv" are included in the word cloud. We cannot conclude anything regarding who is fighting for or supporting whom based on these stand-alone words. However, the presence of the word "kyiv" indicates that a majority of the users expressing themselves in English support Ukraine, as choosing the Ukrainian spelling of Kyiv has become a symbol of supporting the country during the war.
Looking at the Russian word cloud, many of the words from the English word cloud, such as "ukraine(ian)", "russia(n)", "zelensky" and "putin", recur here. However, some distinct words are present as well. These include "terrorist", "operation", "oppression", "belarus", "access", "sanctions" and many more. Once again it is hard to say anything certain about the discourse and opinions towards these subjects based on single words, but we can infer that subjects such as the development of the invasion of Ukraine and the effects of the sanctions on Russia are some of the focuses of the Russian-language tweets.
One of the most distinctive and informative word clouds in terms of sentiment and discourse is the Ukrainian one. Here we see (probably) hashtags such as "stoprussia" and "standwithukraine", which of course express sympathy with the country. Words such as "children" and "feminism" may refer to the war crimes committed by Russia, such as the killing of civilians (including the targeting of schools and kindergartens) and the rape of women, and words like "freedom", "please" and "world" may symbolise the Ukrainians' cry and hope for help from the rest of the world. It should be emphasised that these are just suggestions, and we cannot conclude anything confidently about the meanings behind the word clouds based entirely on single words.
clean_text_test = clean_text.copy()
clean_text_test['date_created'] = pd.to_datetime(clean_text_test['created_utc']).dt.normalize()
clean_text_test = clean_text_test.sort_values(by="date_created")
#dataframe including first time period
start_date1 = pd.to_datetime('2022-02-21 00:00:00+0000')
end_date1 = pd.to_datetime('2022-03-10 00:00:00+0000')
mask1 = (clean_text_test['date_created'] >= start_date1) & (clean_text_test['date_created'] <= end_date1)
clean_text1 = clean_text_test.loc[mask1]
#dataframe including second time period
start_date2 = pd.to_datetime('2022-03-11 00:00:00+0000')
end_date2 = pd.to_datetime('2022-03-29 00:00:00+0000')
mask2 = (clean_text_test['date_created'] >= start_date2) & (clean_text_test['date_created'] <= end_date2)
clean_text2 = clean_text_test.loc[mask2]
#dataframe including third time period
start_date3 = pd.to_datetime('2022-03-30 00:00:00+0000')
end_date3 = pd.to_datetime('2022-04-15 00:00:00+0000')
mask3 = (clean_text_test['date_created'] >= start_date3) & (clean_text_test['date_created'] <= end_date3)
clean_text3 = clean_text_test.loc[mask3]
# create large doc for all dates
time_large_doc1 = clean_text1.tokenized_text.tolist()
time_large_doc1 = list(itertools.chain(*time_large_doc1))
time_large_doc2 = clean_text2.tokenized_text.tolist()
time_large_doc2 = list(itertools.chain(*time_large_doc2))
time_large_doc3 = clean_text3.tokenized_text.tolist()
time_large_doc3 = list(itertools.chain(*time_large_doc3))
all_docs_time = [time_large_doc1, time_large_doc2, time_large_doc3]
# compute TF
time_tf1 = computeTF(time_large_doc1)
time_tf2 = computeTF(time_large_doc2)
time_tf3 = computeTF(time_large_doc3)
# Create corpora including the 3 time-period documents
corpora_time = {}
for period in range(3):
corpora_time[period] = all_docs_time[period]
corpora_list_time = []
for i in range(len(corpora_time)):
corpora_list_time.append(corpora_time[i])
corpora_set_time = set([term for sublist in corpora_list_time for term in sublist])
IDFs_time = {}
N = len(corpora_time)
for term in tqdm(corpora_set_time):
# compute document frequency for each term:
counter = 0
for i in range(len(corpora_time)):
if term in set(corpora_time[i]):
counter += 1
# Compute IDF for term
IDFs_time[term] = np.log(N / counter)
tf_idf1 = computeTF_IDF(time_tf1, IDFs_time)
tf_idf2 = computeTF_IDF(time_tf2, IDFs_time)
tf_idf3 = computeTF_IDF(time_tf3, IDFs_time)
# 21/2 - 10/3 wordcloud
wordcloud_period1 = generate_wordcloud(tf_idf1)
wordcloud_period1.to_file("21_02_to_10_03_wc.png")
# 11/3 - 29/3 wordcloud
wordcloud_period2 = generate_wordcloud(tf_idf2)
wordcloud_period2.to_file("11_03_to_29_03_wc.png")
# 30/3 - 15/4 wordcloud
wordcloud_period3 = generate_wordcloud(tf_idf3)
wordcloud_period3.to_file("30_03_to_15_04_wc.png")
Looking at the word clouds based on relevant words for different time periods, we hoped to see a change of discourse and/or subject as time proceeded. Looking at the first period of the war, spanning from the Russian invasion of Ukraine until March 10th, we see that some of the most prominent words are "williamson", "edinburgh" and "absolutist". The first refers to Hugh Williamson, the Europe and Central Asia director at Human Rights Watch, who in the first days of the war expressed deep concerns about Russia's acts of war crimes (www.hrw.org), and the term "edinburgh" may be rated as relevant because Ukrainians joined huge demonstrations in Scotland against the attacks on their homeland during the first days of the war (www.bbc.com). "Absolutist" can refer to different specific cases or simply the meaning of the term itself. One specific case which was trending on March 7th was when Elon Musk refused to block Russian news sources, which resulted in many people accusing him of being a "free speech absolutist" (www.economictimes.indiatimes.com).
Turning our attention to the second word cloud, from March 11th to March 29th, some of the largest displayed words are "birzamanlar", "ibrahimcelikkol", "zulhak", "russianassettracker" (probably a hashtag) and "schwarzenegger". When investigating these words, we found that some seemed more related to the war than others. For example, "birzamanlar" and "ibrahimcelikkol" are a Turkish expression and a Turkish actor, respectively, and do not have any obvious connection to the war. On the other hand, "russian-asset-tracker" is the name of a global investigation into assets held outside Russia linked to individuals sanctioned for supporting the government of Russia (www.theguardian.com), which was kicked off on April 8th 2022. Furthermore, Arnold Schwarzenegger held a speech on March 18th, where he publicly asked Putin to stop the war (www.bbc.com).
In the last word cloud, representing the period from March 30th until April 15th, it is very obvious that the focus of many tweets regarding the war is the rapid increase in war crimes committed by Russia. Words like "bucha-massacre", "rape" and "acid" are in focus, which most likely refers to the killings and rapes in Bucha during the first weeks of April (www.kyivindependent.com) as well as the public speculation about whether Russia has used chemical weapons in Ukraine (www.19fortyfive.com).
All in all, the word clouds based on TF-IDF scores give insights into the historical and political developments during the war, and the TF-IDF scores seem to capture the more specific stories and trends rather than just the overall most frequently used words.
import pandas as pd
# we need the date and hashtags
hashtag_df = data_df[['date', 'translated_hashtags']]
# remove tweets without hashtags
hashtag_df = hashtag_df[hashtag_df.translated_hashtags.astype(bool)]
hashtag_df.head(3)
# we explode the dataframe to have a single hashtag per row - this is because we want to find the most frequently used hashtags
hashtag_df = hashtag_df.explode('translated_hashtags')
hashtag_value_counts = hashtag_df.translated_hashtags.value_counts()[:500]
top_k_hashtags = list(hashtag_value_counts.index)
counts = list(hashtag_value_counts)
# remove undefined and y from hashtag list
top_k_hashtags = set(top_k_hashtags)-set(['und', 'y'])
# only use top k hashtags
hashtag_df = hashtag_df.loc[hashtag_df.translated_hashtags.isin(top_k_hashtags)]
# we explode the dataframe to have single important words per row
hashtag_df = hashtag_df.explode('translated_hashtags')
# group by date and hashtags and count how many per hashtag per day
hashtag_df = hashtag_df.groupby(['date', 'translated_hashtags']).size().unstack(fill_value=0)
hashtag_df.reset_index()
hashtag_df = hashtag_df.reset_index()
# melt the dataframe to only contain the columns ['date', 'name', 'category', 'value'] (needed for js)
hashtag_df = hashtag_df.melt(id_vars=["date"], var_name="name", value_name="value")
hashtag_df['category'] = 'hashtag'
hashtag_df = hashtag_df[['date', 'name', 'category', 'value']]
hashtag_df = hashtag_df.set_index('date')
hashtag_df.to_csv('data_hashtags.csv')
hashtag_df
# now save as csv file and copy over the data to file in d3js folder
The bar chart is originally an animation displayed on our webpage, but it was only possible to show a still image here (due to the implementation in HTML). We therefore refer to our webpage for the live animation.
Looking at the bar chart from day to day, we see a generally persistent trend in hashtags such as Ukraine and Russia. Furthermore, many hashtags expressing sympathy with Ukraine are trending throughout the period, for example "StandWithUkraine" and "StopRussia".
One of the very interesting things we find when studying the development in hashtag trends is that different Ukrainian city names are trending around the same day as they were attacked by Russia. This means that the bar chart provides us with a rather precise timeline of the attacks on Ukrainian cities. On 24/02/2022 we see that "Kyiv" is trending, which corresponds to the day that Russia initiated attacks on the Kyiv Oblast (www.edition.cnn.com). The same applies on 02/03/2022, where "Kharkiv" was trending and the Russian army attacked Kharkiv (www.nytimes.com), on 10/03/2022, where "Mariupol" was trending and the city was bombed (www.theguardian.com), and lastly "Bucha" and "Buchamassacre" on 04/04/2022 and the following days. Bucha was visited on 04/04 by Ukrainian president Zelensky after the killings, rapes and torture of many Ukrainian civilians.
All in all, it seems that the majority, if not all, of the top trending hashtags on Twitter regarding the war express sympathy with Ukraine. It is therefore probable that the majority of Twitter users sympathise with Ukraine. Still, it cannot be excluded that some nations may use other social media platforms than Twitter, and we can therefore not conclude that the majority of social media users in general support Ukraine.
In the social network part of this assignment, we will investigate Twitter network graphs based on the language of users and parent authors in Ukrainian and Russian. We will first examine to which degree users cluster together and how the network clusters compare to a null model. Then we will look further into these clusters and investigate how communities in the graphs are formed and how strong they are - for this we will use modularity as a measure as well as compare the graphs with configuration models.
Due to computational limitations, an in-depth analysis of the English Twitter network is not possible. If we were to use the whole dataset to create the English network, we would have around 500,000 nodes, which is infeasible to work with. For the Ukrainian and Russian networks, all data points are used - there are 3,562 and 10,339 data points respectively for the two groups. We tried sampling the English tweets (40,000 tweets), but decided that, rather than drawing conclusions from an analysis of a subset instead of the whole dataset, we would focus our attention on the two other language-based networks.
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib as mpl
import networkx as nx
import netwulf as nw
from netwulf import visualize
from pyvis.network import Network
import community.community_louvain
from itertools import combinations
# loading all clean tweets
all_tweets = pd.read_csv("/work/data/ALL_CLEAN_TWEETS.csv",converters={'tokenized_text': eval,'tokenized_text_stemmed': eval,"translated_hashtags": eval }, low_memory=False)
Unlike the GME graph, we will not build a reciprocal undirected graph from the weighted edge lists. This is because the data collection only fetched tweets containing the keywords specified earlier, which prevents us from getting all replies to a single tweet. Furthermore, the Twitter API does not allow us to get all replies, since this requires a Premium account. This means that there will rarely be connections that appear in both directions. This is something we should have considered as a limitation from the beginning, but since a big part of Twitter is retweeting rather than actually replying (which would never result in connections in both directions), building non-reciprocal graphs may still be a good representation of the network.
# get both directed and undirected versions of the network graphs
def create_nx_graph(language,w_edgelist):
G = nx.DiGraph()
for index, value in w_edgelist.items():
G.add_weighted_edges_from([(index[0],index[1],value)])
H = G.to_undirected(reciprocal=False)
H.remove_edges_from(list(nx.selfloop_edges(H)))
H.remove_nodes_from(list(nx.isolates(H)))
return G,H
def create_w_edgelist(df, language):
w_edgelist = df.loc[df["language"]==language].groupby(["username", "parent_author"]).size()
return w_edgelist
# creating network graphs based on the Ukrainian and Russian discourse:
w_edgelist_uk = create_w_edgelist(all_tweets, "uk")
G_uk, H_uk = create_nx_graph("uk", w_edgelist_uk)
w_edgelist_ru = create_w_edgelist(all_tweets,"ru")
G_ru, H_ru = create_nx_graph("ru", w_edgelist_ru)
This gives us the following two undirected networks:
Ukrainian Twitter Network:
Russian Twitter Network:
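To get a quick sense of the size of the two undirected graphs, they can be inspected as follows (a minimal sketch; the exact counts depend on the subsample described above):
for name, graph in [("Ukrainian", H_uk), ("Russian", H_ru)]:
    print(f"{name} network: {graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges")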
Now we will create Random Networks as null models to investigate some properties of the Twitter Networks.
We will create random networks based on the G(N, p) model introduced by Edgar Nelson Gilbert: each pair of the N labelled nodes is connected with probability p. N and p are computed from the network we are investigating as follows:
The formula for the expected number of edges in the random network is (given in section 3.3 of the Network Science book): $$ \langle L\rangle=\sum_{L=0}^{\frac{N(N-1)}{2}} L p_{L}=p \frac{N(N-1)}{2} $$
Setting $\langle L\rangle$ equal to the actual number of edges in the given network, the probability p can be found as: $$ p = \frac{\langle L\rangle}{\frac{N(N-1)}{2}} $$
# using the formula above, the random network function is created:
def create_random_network(H):
L_edges = len(H.edges())
N_nodes = len(H.nodes())
p = L_edges/(N_nodes * (N_nodes-1)/2)
R = nx.Graph()
all_edge_combinations = list(combinations(range(N_nodes), 2))
for edge in all_edge_combinations:
if random.random() < p:
R.add_edge(*edge)
return R
R_uk = create_random_network(H_uk)
R_ru = create_random_network(H_ru)
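For reference, the same G(N, p) null model is also available as a NetworkX built-in; a sketch of the equivalent call is shown below (note that the nodes would be relabelled as integers rather than user names):
# equivalent built-in null model (a sketch; nodes are integers 0..N-1)
def create_random_network_nx(H):
    N_nodes = H.number_of_nodes()
    p = H.number_of_edges() / (N_nodes * (N_nodes - 1) / 2)
    return nx.fast_gnp_random_graph(N_nodes, p)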
This gives us the following Random Networks:
Random Ukrainian Twitter Network:
Random Russian Twitter Network:
Using the random networks, we can investigate the clustering trends within the Ukrainian and Russian Twitter networks. Computing the local clustering coefficient of a particular node allows us to capture the relationship between the neighbours of that node and relate it to the density of the neighbourhood - the more densely connected the neighbourhood surrounding the node, the higher the coefficient. The local clustering coefficient is given in section 2.10 of the Network Science book and is defined as:
$$ C_{i}=\frac{2 L_{i}}{k_{i}\left(k_{i}-1\right)} $$ where $L_i$ is the number of links between the $k_i$ neighbors of node $i$.
We can then use this formula to compute the average clustering coefficient of the whole network: $$ \langle C\rangle=\frac{1}{N} \sum_{i=1}^{N} C_{i} $$
# implementing the average clustering coefficient:
def compute_cluster_coeff(G):
cluster_coefs = []
for node in list(G.nodes()):
# clustering coefficient is 0 if the node only has one neighbor (there are no singletons either)
if len(list(G.neighbors(node))) == 1:
cluster_coefs.append(0)
else:
L_i = 0
neighbors = list(combinations(list(G.neighbors(node)), 2))
for pair in neighbors:
if G.has_edge(*pair):
L_i+=1
k_i = G.degree(node)
cluster_coefs.append(2*L_i/(k_i*(k_i-1)+1e-5)) #add small number to avoid division by zero
return np.mean(cluster_coefs)
H_uk_cc = compute_cluster_coeff(H_uk)
R_uk_cc = compute_cluster_coeff(R_uk)
H_ru_cc = compute_cluster_coeff(H_ru)
R_ru_cc = compute_cluster_coeff(R_ru)
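As a sanity check of our own implementation, the values can be compared against NetworkX's built-in (unweighted) average clustering coefficient; the two should agree closely, up to the small epsilon added in the denominator above:
# cross-check against the library implementation (count_zeros=True matches our convention
# of assigning 0 to degree-1 nodes)
print(H_uk_cc, nx.average_clustering(H_uk, count_zeros=True))
print(H_ru_cc, nx.average_clustering(H_ru, count_zeros=True))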
Cluster coefficient for the Ukrainian Twitter Networks:
Cluster coefficient for the Russian Twitter Networks:
Ukrainian: Original vs. Random Graph
Russian: Original vs. Random Graph
What do these findings tell us?
Comparing the Ukrainian and Russian cluster coefficients, we see that the Russian coefficient is a little more than twice as large as the Ukrainian one. In the two networks, the distribution of original tweets, replies and retweets is about 15%, 6% and 79% for Ukrainian tweets and 18%, 11% and 71% for Russian tweets, which could explain why the Russian clustering coefficient is higher than the Ukrainian one - it seems that people reply to each other more often, rather than retweeting, in the Russian network. However, we must of course be aware that Russian is also spoken in Ukraine (the language distribution there is about 68% Ukrainian and 30% Russian according to Wikipedia).
In both cases, the clustering coefficient of the random network is actually higher than that of the original graph, which tells us that users replying to a tweet usually do not continue to talk with each other in the replies section. This is especially true when retweeting is such a big part of Twitter - if many people retweet the same tweet, they of course have the same parent author, but a cluster will not form around them since they do not reply to each other. We can relate this to how the original networks compare to their random counterparts visually - in both networks, the number of edges is roughly the same in the original and random networks, but the number of nodes is higher in the random graph. This could explain why the original networks look more sparse than their random counterparts.
This clustering trend makes sense if we consider how users generally interact on Twitter. The structure we typically see in Twitter networks is, for instance, that a certain tweet goes viral and many users retweet, reply to and quote-tweet it - then these replies and quote tweets are retweeted and replied to, and so on, almost in an exponential manner. This means that people do not form one big discussion forum in the comments of the original tweet, but rather spread the message in a very fast and "broad" way. Furthermore, the connections we observe in these networks are not the same connections we see in a social network of friends - the links do not correspond to friendships but rather to a reaction one user has to something another user stated. Parent authors are not inclined to respond in the same way as on Reddit, for instance - if the Twitter account of a big television network posts a tweet about some current news regarding the war, it will most likely not respond to users in its comment section. This means we will not typically observe triadic closures in these Twitter networks, compared to a social network of friends, family and co-workers.
Comparing how people interact on Twitter with Reddit, we know that there is much more of a sense of community on Reddit - when people interact in subreddits about a specific topic and answer each other back and forth, they are much more likely to become friends or at least acquaintances. On Twitter, however, seeing as we investigate a difficult and widespread topic, there is not the same sense of community as on Reddit.
Even though we do not see the same strong sense of community on Twitter compared to Reddit, we can still investigate interesting properties of the unique community structures (if such structures exist).
We can investigate the communities in the Twitter networks by computing modularity values, which measure how good a certain partition is at detecting communities. The higher these values are, the stronger the detected communities. This is because the observed number of links between nodes within a community is then higher than the expected number of links, and we can infer that a certain cluster structure must be present.
The modularity is a number in $[-1,1]$ - a value close to 1 indicates a strong partitioning; if we assign the whole network to a single community, the modularity value becomes 0; and a negative modularity value occurs, for instance, when we assign each node to its own individual community.
Computing the modularity of several partitions allows us to compare them with each other, and optimising this modularity value helps us detect even stronger communities.
Based on equation 9.12 in section 9.4 of the Network Science book, the formula for modularity is: $$ M=\sum_{c=1}^{n_{c}}\left[\frac{L_{c}}{L}-\left(\frac{k_{c}}{2 L}\right)^{2}\right] $$
where $n_c$ is the number of communities in the partition, $N_c$ and $L_c$ are the number of nodes and edges, respectively, in each of the $n_c$ communities, $k_c$ is the total degree of the nodes in community $C_c$, and $L$ is the number of links in the entire graph. We formulate this in the following function:
def compute_modularity(graph, partition):
communities = list(set(partition.values()))
L = graph.number_of_edges()
M = 0
for c in communities:
group_c = [k for k,v in partition.items() if v == c]
subgraph_c = graph.subgraph(group_c)
k_c = sum([graph.degree[node] for node in subgraph_c.nodes()])
L_c = subgraph_c.number_of_edges()
M += (L_c/L) - (k_c/(2*L))**2
return M
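As a quick sanity check of this function (a small sketch on NetworkX's built-in karate club graph, not part of the actual analysis), assigning the whole graph to one community should give a modularity of 0, while assigning every node to its own community should give a negative value:
G_toy = nx.karate_club_graph()
print(compute_modularity(G_toy, {node: 0 for node in G_toy.nodes()}))      # 0.0
print(compute_modularity(G_toy, {node: node for node in G_toy.nodes()}))   # negative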
To find communities in the Twitter networks, we use the python-louvain implementation of the Louvain algorithm, and we use the resulting partitions to colour the nodes of the networks accordingly.
partition_uk = community.community_louvain.best_partition(H_uk)
partition_ru = community.community_louvain.best_partition(H_ru)
H_uk_mod = compute_modularity(H_uk, partition_uk)
H_ru_mod = compute_modularity(H_ru, partition_ru)
# to visualize the communities
for k, v in H_uk.nodes(data=True):
v['group'] = partition_uk[k]
for k, v in H_ru.nodes(data=True):
v['group'] = partition_ru[k]
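As a cross-check of our implementation, the same partitions can be scored with NetworkX's built-in modularity function (a sketch; weight=None is passed so that edges are counted unweighted, matching our own function):
from networkx.algorithms.community import modularity
communities_uk = [{n for n, c in partition_uk.items() if c == comm} for comm in set(partition_uk.values())]
communities_ru = [{n for n, c in partition_ru.items() if c == comm} for comm in set(partition_ru.values())]
print(H_uk_mod, modularity(H_uk, communities_uk, weight=None))
print(H_ru_mod, modularity(H_ru, communities_ru, weight=None))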
Modularity value for Ukrainian Twitter Graph: 0.92265
Modularity value for Russian Twitter Graph: 0.88728
Visualizing the communities in the Ukrainian network:
Visualizing the communities in the Russian network:
Before we can conclude anything regarding the detected communities based on their modularity values, we must assess whether the computed modularity is statistically different from $0$. To do this we implement a configuration model, which creates a new network such that the degree of each node is the same as in the original network, but with different links. The algorithm works in the following way: all edges are broken up into "stubs" (half-edges), the stubs are shuffled, and they are then paired up again two at a time to form the new edges.
This is implemented in the following function:
def config_model(G):
# unpack the tuples in the edge list:
unique_list = [item for t in list(G.edges()) for item in t]
# shuffle stubs:
random.shuffle(unique_list)
# new set of edges:
new_edges = list(zip([unique_list[index] for index in range(0, len(unique_list), 2)],
[unique_list[index] for index in range(1, len(unique_list), 2)]))
# initialize new graph with created graphs
G_new = nx.MultiGraph()
G_new.add_edges_from(new_edges)
return G_new
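Since the whole point of the configuration model is to preserve the degree sequence, a quick check could look like this (a sketch; NetworkX counts a self-loop as two towards a node's degree, which is consistent with the stub counting above):
R_test = config_model(H_uk)
assert sorted(d for _, d in H_uk.degree()) == sorted(d for _, d in R_test.degree())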
Using this algorithm, we create $1000$ randomised versions of each network, and for each of them we compute the modularity of the partition.
def config_mod1000(H, partition):
    # create 1000 randomized (degree-preserving) versions of the network
    swaps = []
    for i in range(1000):
        T = H.copy()
        T_swap = config_model(T)
        swaps.append(T_swap)
    # compute the modularity of the original partition on each randomized network
    modularities = [compute_modularity(swap, partition) for swap in swaps]
    return modularities
uk_mods = config_mod1000(H_uk, partition_uk)
ru_mods = config_mod1000(H_ru, partition_ru)
We can plot these modularities for each network:
# set up plotting parameters
def setup_mpl():
    mpl.rcParams["font.family"] = "Helvetica Neue"
    mpl.rcParams["lines.linewidth"] = 1
setup_mpl()
fig, ax = plt.subplots(1, 1, dpi=300, figsize=(8, 3), tight_layout=True)
ax.hist(uk_mods, label="Random modularity", alpha=0.8)
ax.axvline(x=H_uk_mod, label="Original modularity", color="r", ls="--")
ax.set_xlabel("Modularity")
ax.set_ylabel("Count")
plt.title("Ukraine: Original vs Random Modularity")
plt.legend(loc=10)
plt.savefig("uk_1000mods.png")
plt.show()
fig, ax = plt.subplots(1, 1, dpi=300, figsize=(8, 3), tight_layout=True)
ax.hist(ru_mods, label="Random modularity", alpha=0.8)
# vertical line at the Russian network's original modularity
ax.axvline(x=H_ru_mod, label="Original modularity", color="r", ls="--")
ax.set_xlabel("Modularity")
ax.set_ylabel("Count")
plt.title("Russia: Original vs Random Modularity")
plt.legend(loc=10)
plt.savefig("ru_1000mods.png")
plt.show()
The 95th percentiles for the two tests are:
Ukraine:
Russia:
In both cases the modularities of the configuration models are centered around zero, which corresponds to assigning the whole network to a single community, i.e. no community structure at all. Visualizing the randomized modularities and computing their 95th percentile allows us to conclude that the modularity values of 0.923 and 0.887 for the Ukrainian and Russian networks, respectively, are significantly different from $0$. This means that the partitions found by the Python Louvain algorithm are clearly better than assigning no communities at all or assigning each node to its own community.
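For completeness, the percentile check described above can be written out directly with NumPy (a minimal sketch operating on the lists of randomized modularities computed before):
import numpy as np

# 95th percentile of the modularities obtained from the 1000 configuration models,
# compared to the observed modularity of each network
print("Ukraine: 95th percentile =", np.percentile(uk_mods, 95), "| observed:", H_uk_mod)
print("Russia: 95th percentile =", np.percentile(ru_mods, 95), "| observed:", H_ru_mod)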
When investigating some of these communities, we see, for instance, that the following three users have the highest in-degrees in the Ukrainian network:
Authors with the highest in-degrees in the Russian network:
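These rankings can be extracted directly from the graphs, for instance as in the sketch below (assuming, as in the rest of the network analysis, that H_uk and H_ru are directed graphs so that in-degrees are defined):
# top three parent authors by in-degree, i.e. the accounts receiving the most interactions
top3_uk = sorted(H_uk.in_degree, key=lambda pair: pair[1], reverse=True)[:3]
top3_ru = sorted(H_ru.in_degree, key=lambda pair: pair[1], reverse=True)[:3]
print("Ukrainian network:", top3_uk)
print("Russian network:", top3_ru)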
Looking at the visualizations, coloring the nodes based on the community partitions allows us to detect some cluster structures. Some of the bigger clusters result from the neighborhoods between the parent authors mentioned above and the users reacting to their tweets. As perhaps expected, most of the clusters in both networks come from users retweeting and replying to larger news stations or organizations. We rarely see private users engage in discussions of the same size as those around the news stations, but as discussed earlier, this is likely because the way Twitter users tweet and reply to each other differs from the pattern we see on Reddit.
For visualization purposes, we also create interactive versions of the Ukrainian and Russian networks, where the reader can investigate the communities and read what the users are tweeting about. Due to computational limitations, these networks are built from a subset of the original two Twitter networks.
subset_tweets = all_tweets.sample(100000)
w_edgelist_ru = create_w_edgelist(subset_tweets, "ru")
_, I_ru = create_nx_graph("ru",w_edgelist_ru)
w_edgelist_uk = create_w_edgelist(subset_tweets, "uk")
_, I_uk = create_nx_graph("uk",w_edgelist_uk)
Ukrainian Interactive Network
Russian Interactive Network
partition_I_ru = community.community_louvain.best_partition(I_ru)
partition_I_uk = community.community_louvain.best_partition(I_uk)
# get the text of a tweet between a user and its parent author
def get_text(username, parent):
    return subset_tweets.loc[(subset_tweets["username"] == username) &
                             (subset_tweets["parent_author"] == parent)].translated_text.values[0]

# weighted edgelist as a dataframe with texts
def w_edgelist_df(language, w_edgelist):
    w_edgelist_df = w_edgelist.to_frame('size')
    w_edgelist_df = w_edgelist_df.reset_index()
    w_edgelist_df['text'] = w_edgelist_df[['username', 'parent_author']].apply(lambda x: get_text(*x), axis=1)
    return w_edgelist_df
w_edgelist_df_ru = w_edgelist_df("ru",w_edgelist_ru)
w_edgelist_df_uk = w_edgelist_df("uk",w_edgelist_uk)
We create the interactive graphs using the Pyvis library. Note that parent authors will not have a tweet attached to them, since they can have multiple tweets.
def create_interactive_graph(w_edgelist, w_edgelist_df, partition):
    # generate a random hex color for each community
    colors = ["#{:06x}".format(random.randint(0, 0xFFFFFF)) for i in range(len(set(partition.values())))]
    net = Network(notebook=True)
    net.height = '1000px'
    net.width = '1000px'
    # add the users as nodes, with their tweet text shown on hover
    for user in list(w_edgelist_df.username.unique()):
        net.add_nodes([user],
                      title=[w_edgelist_df.loc[w_edgelist_df["username"] == user].text.values[0]],
                      color=[colors[partition[user]] if user in partition.keys() else '#FFFFFF'])
    # add the parent authors as nodes (no single tweet attached, since they can have several)
    for parent in list(w_edgelist_df.parent_author.unique()):
        net.add_nodes([parent],
                      color=[colors[partition[parent]] if parent in partition.keys() else '#FFFFFF'])
    # when adding users as nodes in the pyvis network, some of the user-strings are altered,
    # so we use a try/except statement (only a few nodes are missed)
    for index, value in w_edgelist.items():
        try:
            net.add_edges([(index[0], index[1], value)])
        except:
            continue
    return net
# use the partitions computed on the interactive (subset) networks
net_uk = create_interactive_graph(w_edgelist_uk, w_edgelist_df_uk, partition_I_uk)
net_uk.show('uk_final.html')
net_ru = create_interactive_graph(w_edgelist_ru, w_edgelist_df_ru, partition_I_ru)
net_ru.show('ru_final.html')
The interactive networks can be found on our website!
One of the most important limitations to address in this project is the size and incompleteness of our dataset. The original dataset consisted of more than 3 million data points, which is an extensive amount of data. Even so, it will still lack tweets addressing the Russia-Ukraine war if they do not contain the key words specified in the data collection query. Moreover, we reduced the dataset further when we subsampled 18.000 tweets per day due to computational limits, yielding an even bigger loss of tweets discussing the subject. We intended to use our final dataset of 1 million tweets for all analysis purposes. In practice, however, this became infeasible due to time and computational limits, and we therefore ended up using an even smaller part of the dataset for some parts of our analysis. This reduction of our dataset is important to keep in mind when drawing any conclusions on the basis of this data, as it can only serve as an approximation of the true trends. However, we argue that the data partitions used for all parts of the analysis are still a representative cross section of the complete debate on Twitter and the trends associated with it, as we subsampled our data uniformly across all dates and randomly within each day.
Initially, our plan was to use the (geo-)location data associated with each tweet to split the networks and parts of the text analyses by nationality. However, during the project we realised that most Twitter users do not allow tracking of geo-location, and the location data provided by the users themselves was often noisy or unusable. As a workaround we therefore used the written language of the tweet instead. This can still give some indication of trends within nationalities, although the analyses were now limited to tweets written by people who expressed themselves in their native language. Moreover, English is used by the majority of users across all nationalities, which makes the distribution of tweets per language significantly skewed. Still, we found it interesting to investigate the trends within Ukrainian and Russian as well as the general public using English, and we concluded that it was still somewhat representative of the respective nationalities because of the large amount of data.
Looking at the tweet volume volatility for English, Russian and Ukrainian, we found that there were significantly more English tweets than tweets in the other languages. This is important to be aware of when we conclude anything within the time domain, as the English tweets will dominate the tendencies significantly. In other words, any conclusions about overall tendencies over time will be much more representative of the English-speaking "community" on Twitter than of any other. However, as this also indicates that the majority of Twitter users communicate in English, we can argue that the time analysis and its conclusions are still representative of the majority of Twitter users, i.e. the overall public opinions and trends. This bias is removed in the language analysis, as we filter the tweets on language and only consider tweets within one written language at a time.
Our initial intention with the dispersion plot was to show some kind of development in the usage of certain words over time. However, as we had to specify the displayed words manually, the process became somewhat trial-and-error based. We expected that using the most frequent/general key words in the debate would not give any significant results, as these words are used continuously over time. We therefore experimented with more specific words representing certain events, such as "nuclear war" and "chernobyl", and expected to see a higher usage of these words around the time the corresponding events took place. Still, this was not the case in practice, and the dispersion plot ended up being rather uninformative.
One rather important bias in our analysis and dataset is the use of key words in the data collection. As the tweets were queried based on these specific key words, the volume of tweets containing them will also be unnaturally high. This is an important flaw in our analysis when investigating any kind of trends based on frequency, as many of the key words will, by construction, appear as very popular. Examples of analyses where this bias is significant are the bar chart of most popular hashtags and the word shift plots. A workaround could have been to remove all key words from our dataset, but this might have resulted in other important tendencies getting lost instead.
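Such a workaround could look like the sketch below (an illustration only; the keyword list is an example based on the query terms mentioned in this report, not the exact list used for the data collection):
import re

# example keyword list - an assumption, not the exact query used when collecting the data
query_keywords = ["russia", "ukraine", "putin", "zelensky"]
pattern = re.compile(r"\b(" + "|".join(query_keywords) + r")\b", flags=re.IGNORECASE)

# strip the query keywords from the (translated) tweet texts before any frequency-based analysis
all_tweets["text_no_keywords"] = all_tweets["translated_text"].str.replace(pattern, "", regex=True)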
As mentioned when building the Twitter networks, one limitation of our dataset is that the fetched tweets must contain the key words "russia", "ukraine", "putin", "zelensky", etc. This means we are not able to follow an entire Twitter thread if the replies to the original tweet do not contain any of these key words. For our networks, this limitation means we will almost never see triadic closures between a parent author and the users who reply to their tweets. Even if we wanted to fetch all replies, as we did with the "search_comments" function from the Reddit API, we would need access to a premium Twitter API account. However, we argue that even if we had all replies, the nature of how Twitter users behave is still represented in our networks: users rarely gather in the same thread and stay long enough to reply to each other, but rather scatter throughout the conversation, almost in an exponential manner, as mentioned earlier.
Furthermore, one could expand these networks so that they are not based on the interaction between users and parent authors within a particular language, but rather on how users gather in groups discussing certain topics within this difficult subject matter. This would require a form of NLP-based sentiment analysis on top of the network analysis, which could prove difficult to carry out but would also be very interesting to investigate.