In this project, I work with the Twitter API to analyze Donald Trump's tweets.

import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import zipfile
import json
from pprint import pprint

# Ensure that Pandas shows at least 280 characters in columns, so we can see full tweets
pd.set_option('max_colwidth', 280)

%matplotlib inline
plt.style.use('fivethirtyeight')
import seaborn as sns
sns.set()
sns.set_context("talk")
import re


Twitter provides an API for downloading tweet data in large batches. The tweepy package makes it fairly easy to use.

import tweepy
from tweepy import TweepError
import logging
import json

key_file = 'keys.json'
# Load the Twitter API keys saved to keys.json (created in question 1).
# NOTE(review): the original cell was truncated here; the json.load line
# below is the reconstruction of the missing body.
with open(key_file) as f:
    keys = json.load(f)

# Authenticate with Twitter.  A TweepError at this point usually means the
# keys in keys.json are wrong or expired.
try:
    auth = tweepy.OAuthHandler(keys["consumer_key"], keys["consumer_secret"])
    auth.set_access_token(keys["access_token"], keys["access_token_secret"])
    api = tweepy.API(auth)
except TweepError as e:
    logging.warning("There was a Tweepy error. Double check your API keys and try again.")
    logging.warning(e)

Your username is: ColeStriler

from pathlib import Path
from pprint import pprint
import json

ds_tweets_save_path = "BerkeleyData_recent_tweets.json"
# Guard: only hit the Twitter API when no cached copy exists on disk, so
# re-running the notebook doesn't re-download (and re-rate-limit) every time.
if not Path(ds_tweets_save_path).is_file():
    # Getting as many recent tweets by @BerkeleyData as Twitter will let us have.
    # We use tweet_mode='extended' so that Twitter gives us full 280 character tweets.
    # This was a change introduced in September 2017.

    # The tweepy Cursor API actually returns "sophisticated" Status objects but we
    # will use the basic Python dictionaries stored in the _json field.
    example_tweets = [t._json for t in tweepy.Cursor(api.user_timeline, id="BerkeleyData",
                                                     tweet_mode='extended').items()]

    # Saving the tweets to a json file on disk for future analysis
    with open(ds_tweets_save_path, "w") as f:
        json.dump(example_tweets, f)

# Re-load from the cache file so both code paths yield identical data.
# NOTE(review): the original cell was truncated; the json.load line is the
# reconstructed body of this `with`.
with open(ds_tweets_save_path, "r") as f:
    example_tweets = json.load(f)

def load_keys(path):
    """Loads your Twitter authentication keys from a JSON file on disk.

    Args:
        path (str): The path to your key file.  The file should
            be in JSON format and look like this (but filled in):
            {
                "consumer_key": "<your key here>",
                "consumer_secret": "<your secret here>",
                "access_token": "<your token here>",
                "access_token_secret": "<your secret here>"
            }

    Returns:
        dict: A dictionary mapping key names (like "consumer_key") to
        key values."""
    with open(path) as f:
        keys = json.load(f)  # the file is plain JSON, so json.load suffices
    return keys

def download_recent_tweets_by_user(user_account_name, keys):
    """Downloads recent tweets posted by one Twitter user.

    Args:
        user_account_name (str): The name of the Twitter account
            whose tweets will be downloaded.
        keys (dict): A Python dictionary with Twitter authentication
            keys (strings), like this (but filled in):
            {
                "consumer_key": "...",
                "consumer_secret": "...",
                "access_token": "...",
                "access_token_secret": "..."
            }

    Returns:
        list: A list of dictionary objects, each representing one tweet.
        Returns None if authentication fails (a warning is logged instead)."""
    import tweepy

    try:
        auth = tweepy.OAuthHandler(keys["consumer_key"], keys["consumer_secret"])
        auth.set_access_token(keys["access_token"], keys["access_token_secret"])
        api = tweepy.API(auth)
        # tweet_mode='extended' asks Twitter for full 280-character tweets.
        tweets = [t._json for t in tweepy.Cursor(api.user_timeline, id=user_account_name,
                                                 tweet_mode='extended').items()]
        return tweets
    except TweepError as e:
        # TweepError comes from the module-level `from tweepy import TweepError`.
        logging.warning("There was a Tweepy error. Double check your API keys and try again.")
        logging.warning(e)

def save_tweets(tweets, path):
    """Saves a list of tweets to a file in the local filesystem.

    This function makes no guarantee about the format of the saved
    tweets, **except** that calling load_tweets(path) after
    save_tweets(tweets, path) will produce the same list of tweets
    and that only the file at the given path is used to store the
    tweets.  (That means you can implement this function however
    you want, as long as it obeys that contract.)

    Args:
        tweets (list): A list of tweet objects (of type dict) to
            be saved.
        path (str): The place where the tweets will be saved.

    Returns:
        None"""
    with open(path, "w") as f:
        json.dump(tweets, f)  # plain JSON keeps the round-trip trivial

def load_tweets(path):
    """Loads tweets that have previously been saved.

    Calling load_tweets(path) after save_tweets(tweets, path)
    will produce the same list of tweets.

    Args:
        path (str): The place where the tweets were saved.

    Returns:
        list: A list of dictionary objects, each representing one tweet."""
    with open(path, "r") as f:
        tweets = json.load(f)  # restores exactly what save_tweets wrote
    return tweets

def get_tweets_with_cache(user_account_name, keys_path):
    """Get recent tweets from one user, loading from a disk cache if available.

    The first time you call this function, it will download tweets by
    the user.  On subsequent calls it will load the tweets from a save
    file in your local filesystem.  All this is done using the functions
    defined in the previous cells.  This has benefits and drawbacks that
    often appear when you cache data:

    +: Using this function will prevent extraneous usage of the Twitter API.
    +: You will get your data much faster after the first time it's called.
    -: If you want fresher tweets (or because you screwed up something in
       the previous cell and your tweets aren't what you wanted), you'll
       have to find the save file (which will look like
       <something>_recent_tweets.pkl) and delete it.

    Args:
        user_account_name (str): The Twitter handle of a user, without the @.
        keys_path (str): The path to a JSON keys file in your filesystem.

    Returns:
        list: A list of dictionary objects, each representing one tweet.
    """
    save_path = user_account_name + '_recent_tweets.pkl'
    if not Path(save_path).is_file():
        # Cache miss: authenticate, download, then persist to disk.
        keys = load_keys(keys_path)
        tweets = download_recent_tweets_by_user(user_account_name, keys)
        save_tweets(tweets, save_path)
    # Always serve from the cache file so both paths return identical data.
    tweets = load_tweets(save_path)
    return tweets


Now we can obtain roughly the last 3,000 tweets from the @realdonaldtrump account.

# Download (or load from the local cache) Trump's most recent tweets.
trump_tweets = get_tweets_with_cache("realdonaldtrump", key_file)

Number of tweets downloaded: 3213


We are limited to how many tweets we can download. In what month is the oldest tweet from Trump?

# The API returns tweets newest-first, so the last element is the oldest.
trump_tweets[-1]['created_at']

'Mon Oct 30 14:28:10 +0000 2017'
# Month read off the timestamp printed above.
oldest_month = 'October'


I will use the fetch_and_cache utility to download the dataset.

# Download the dataset
# fetch_and_cache is a course-provided helper (utils.py); it downloads the
# file once and reuses the local copy on later runs.
from utils import fetch_and_cache
data_url = 'http://www.ds100.org/fa18/assets/datasets/old_trump_tweets.json.zip'
file_name = 'old_trump_tweets.json.zip'

dest_path = fetch_and_cache(data_url=data_url, file=file_name)
print(f'Located at {dest_path}')

Using version already downloaded: Thu Oct 25 03:06:43 2018
MD5 hash of file: b6e33874de91d1a40207cdf9f9b51a09
Located at data/old_trump_tweets.json.zip


Finally, I will load the tweets directly from the compressed file without decompressing it first.

# Read the JSON straight out of the zip archive -- no need to extract it first.
# NOTE(review): the original cell was truncated; the json.load line is the
# reconstructed body of this `with`.
my_zip = zipfile.ZipFile(dest_path, 'r')
with my_zip.open("old_trump_tweets.json", "r") as f:
    old_trump_tweets = json.load(f)


As a dictionary we can also list the keys:

old_trump_tweets[0].keys()

dict_keys(['created_at', 'id', 'id_str', 'text', 'truncated', 'entities', 'extended_entities', 'source', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'is_quote_status', 'retweet_count', 'favorite_count', 'favorited', 'retweeted', 'possibly_sensitive', 'lang'])

Now I will merge the old_trump_tweets and the trump_tweets we downloaded from twitter into one giant list of tweets.

# De-duplicate by tweet ID.  Archived tweets go in first, then the freshly
# downloaded ones, so a tweet present in both keeps the newer copy.
merged_by_id = {}
for tw in old_trump_tweets:
    merged_by_id[tw["id"]] = tw
for tw in trump_tweets:
    merged_by_id[tw["id"]] = tw
all_tweets = list(merged_by_id.values())

assert len(all_tweets) > len(trump_tweets)
assert len(all_tweets) > len(old_trump_tweets)


Below I make a DataFrame called trump containing all the tweets stored in all_tweets. The index of the dataframe is the ID of each tweet (looks something like 907698529606541312). It has these columns:

• time: The time the tweet was created encoded as a datetime object. (Use pd.to_datetime to encode the timestamp.)
• source: The source device of the tweet.
• text: The text of the tweet.
• retweet_count: The retweet count of the tweet.
pd.to_datetime(all_tweets[0]['created_at'])

Timestamp('2016-10-12 14:00:48')
# One column per field, all aligned with the order of all_tweets.
time = [pd.to_datetime(tw['created_at']) for tw in all_tweets]
source = [tw['source'] for tw in all_tweets]
# Older API responses use 'text'; tweet_mode='extended' responses use 'full_text'.
text = [tw['text'] if 'text' in tw else tw['full_text'] for tw in all_tweets]
retweet_count = [tw['retweet_count'] for tw in all_tweets]
index = [tw['id'] for tw in all_tweets]

# Index the frame by tweet ID so later joins/lookups use the ID directly.
trump = pd.DataFrame(
    {"time": time, "source": source, "text": text, "retweet_count": retweet_count},
    index=index,
)

time source text retweet_count
786201435486781440 2016-10-12 13:46:43 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> Very little pick-up by the dishonest media of incredible information provided by WikiLeaks. So dishonest! Rigged system! 22609
786189446274248704 2016-10-12 12:59:05 <a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a> Crooked Hillary Clinton likes to talk about the things she will do but she has been there for 30 years - why didn't she do them? 18329
786054986534969344 2016-10-12 04:04:47 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> Thank you Florida- a MOVEMENT that has never been seen before and will never be seen again. Lets get out &amp;… https://t.co/t9XM9wFDZI 18789
786007502639038464 2016-10-12 00:56:06 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> Join me Thursday in Florida &amp; Ohio!\nWest Palm Beach, FL at noon:\nhttps://t.co/jwbZnQhxg9\nCincinnati, OH this 7:30pm:\nhttps://t.co/5w2UhalPIx 7761

## Tweet Source Analysis¶

In the following questions, I am going to find out the characteristics of Trump tweets and the devices used for the tweets.

First let's examine the source field:

trump['source'].unique()

array([ '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
'<a href="http://instagram.com" rel="nofollow">Instagram</a>',
'<a href="https://periscope.tv" rel="nofollow">Periscope</a>'], dtype=object)
# Removing the HTML anchor tags from the source field so only the client
# name remains.  regex=True is explicit because pandas >= 1.2 warns -- and
# pandas 2.0 silently switches to literal matching -- when a regex pattern
# is passed without it.
trump['source'] = trump["source"].str.replace('<[^>]*>', '', regex=True)
trump['source'].unique()

array(['Twitter for iPhone', 'Twitter for Android', 'Twitter Web Client',
'Twitter Ads', 'Periscope'], dtype=object)
from datetime import datetime
ELEC_DATE = datetime(2016, 11, 8)
INAUG_DATE = datetime(2017, 1, 20)
# Sanity check: which clients tweeted between election day and inauguration.
# NOTE(review): the original assert was truncated mid-set-literal; the set
# below is reconstructed from the source values shown above -- confirm
# against the original notebook.
assert set(trump[(trump['time'] > ELEC_DATE) & (trump['time'] < INAUG_DATE)]['source'].unique()) == set(
    ['Twitter Ads', 'Twitter for Android', 'Twitter for iPhone', 'Twitter Web Client'])


We can see in the following plot that there are two device types that are more commonly used

# Bar chart of tweet counts per source device.
trump['source'].value_counts().plot(kind="bar")
plt.ylabel("Number of Tweets")
plt.show()


Is there a difference between his Tweet behavior across these devices? I will attempt to answer this question in my subsequent analysis.

First, I'll take a look at whether Trump's tweets from an Android come at different times than his tweets from an iPhone. Note that Twitter gives us his tweets in the UTC timezone (notice the +0000 in the first few tweets)

# The raw created_at strings carry a +0000 (UTC) offset.
for t in trump_tweets[0:3]:
    print(t['created_at'])

Thu Oct 25 01:43:41 +0000 2018
Wed Oct 24 23:54:32 +0000 2018
Wed Oct 24 18:55:42 +0000 2018


I'll convert the tweet times to US Eastern Time, the timezone of New York and Washington D.C., since those are the places we would expect the most tweet activity from Trump.

# NOTE(review): "EST" is a fixed UTC-5 offset and ignores daylight saving;
# "US/Eastern" would track DST correctly.  Left as-is because the hour
# assertion later in this notebook was computed against EST -- confirm
# before changing.
trump['est_time'] = (
    trump['time'].dt.tz_localize("UTC") # Set initial timezone to UTC
    .dt.tz_convert("EST") # Convert to Eastern Time
)

time source text retweet_count est_time
786204978629185536 2016-10-12 14:00:48 Twitter for iPhone PAY TO PLAY POLITICS. \n#CrookedHillary https://t.co/wjsl8ITVvk 24915 2016-10-12 09:00:48-05:00
786201435486781440 2016-10-12 13:46:43 Twitter for iPhone Very little pick-up by the dishonest media of incredible information provided by WikiLeaks. So dishonest! Rigged system! 22609 2016-10-12 08:46:43-05:00
786189446274248704 2016-10-12 12:59:05 Twitter for Android Crooked Hillary Clinton likes to talk about the things she will do but she has been there for 30 years - why didn't she do them? 18329 2016-10-12 07:59:05-05:00
786054986534969344 2016-10-12 04:04:47 Twitter for iPhone Thank you Florida- a MOVEMENT that has never been seen before and will never be seen again. Lets get out &amp;… https://t.co/t9XM9wFDZI 18789 2016-10-11 23:04:47-05:00
786007502639038464 2016-10-12 00:56:06 Twitter for iPhone Join me Thursday in Florida &amp; Ohio!\nWest Palm Beach, FL at noon:\nhttps://t.co/jwbZnQhxg9\nCincinnati, OH this 7:30pm:\nhttps://t.co/5w2UhalPIx 7761 2016-10-11 19:56:06-05:00

Adding a column called hour to the trump table which contains the hour of the day as floating point number computed by:

$$\text{hour} + \frac{\text{minute}}{60} + \frac{\text{second}}{60^2}$$

# Fractional hour-of-day in Eastern time: hour + minute/60 + second/60^2.
# Vectorized with the .dt accessor; the original indexed with
# trump['est_time'][i] for i in range(len(trump)), which is label-based
# lookup on a tweet-ID index and not positional.
trump['hour'] = (trump['est_time'].dt.hour
                 + trump['est_time'].dt.minute / 60
                 + trump['est_time'].dt.second / 60**2)
trump.columns

Index(['time', 'source', 'text', 'retweet_count', 'est_time', 'hour'], dtype='object')
assert np.isclose(trump.loc[690171032150237184]['hour'], 8.93639)


I will use this data along with the seaborn distplot function to examine the distribution over hours of the day in eastern time that trump tweets on each device for the 2 most commonly used devices.

# Compare when iPhone vs Android tweets are posted (Eastern-time hour of day).
iphone = trump[trump['source'] == 'Twitter for iPhone']
android = trump[trump['source'] == 'Twitter for Android']

plt.figure(figsize=(9, 6))
# hist=False draws only the kernel density estimate, not the histogram bars.
sns.distplot(iphone['hour'], label = 'iPhone', hist=False)
sns.distplot(android['hour'], label = 'Android', hist=False)
plt.ylabel('fraction')
plt.xticks( [0, 10, 20, 30])
plt.legend()
plt.show()


According to this Verge article, Donald Trump switched from an Android to an iPhone sometime in March 2017.

Below I create a figure identical to the figure from 4c, except I should show the results only from 2016.

During the campaign, it was theorized that Donald Trump's tweets from Android were written by him personally, and the tweets from iPhone were from his staff.

# Add a feature called 'year' taken from the timezone-converted timestamp.
# Vectorized with .dt.year; the original looped with trump['est_time'][i],
# which is label-based lookup on a tweet-ID index and not positional.
trump['year_mine'] = trump['est_time'].dt.year

# Same comparison as 4c, restricted to tweets from 2016.
iphone_16 = trump[(trump['source'] == 'Twitter for iPhone') & (trump['year_mine'] == 2016)]
android_16 = trump[(trump['source'] == 'Twitter for Android') & (trump['year_mine'] == 2016)]

plt.figure(figsize=(9, 6))
sns.distplot(iphone_16['hour'], label='iPhone', hist=False)
sns.distplot(android_16['hour'], label='Android', hist=False)
plt.ylabel('fraction')
plt.xticks([0, 10, 20, 30])
plt.legend()
plt.show()

# Written answer for the staff-vs-Trump device theory.
# (Typos in the original text -- "graphy", "disribution" -- fixed.)
text = """My figure does not support this theory. As you can see
from the above graph, the distribution of Android tweets is
bimodal and appears to be on a set schedule: strictly in the
morning, a few mid-day, and in the afternoon. Comparing this
figure with the figure in 4c, the shape of the distribution of
Android tweets does not change. This leads me to believe that
tweets done on an android were from staff members. The distribution
of iPhone tweets seems more random which leads me to believe these
were the ones from Trump."""
print(text)

My figure does not support this theory. As you can see
from the above graph, the distribution of Android tweets is
bimodal and appears to be on a set schedule: strictly in the
morning, a few mid-day, and in the afternoon. Comparing this
figure with the figure in 4c, the shape of the distribution of
Android tweets does not change. This leads me to believe that
tweets done on an android were from staff members. The distribution
of iPhone tweets seems more random which leads me to believe these
were the ones from Trump.


Let's now look at which device he has used over the entire time period of this dataset.

To examine the distribution of dates we will convert the date to a fractional year that can be plotted as a distribution.

import datetime
def year_fraction(date):
    """Convert a date to a fractional year, e.g. 2016-07-02 -> ~2016.5.

    The fraction is days elapsed since Jan 1 divided by the length of
    that year (so leap years divide by 366)."""
    year_start = datetime.date(date.year, 1, 1).toordinal()
    next_year_start = datetime.date(date.year + 1, 1, 1).toordinal()
    days_in_year = next_year_start - year_start
    elapsed_days = date.toordinal() - year_start
    return date.year + elapsed_days / days_in_year

# Fractional-year timestamp for each tweet (e.g. mid-2017 -> ~2017.5).
trump['year'] = trump['time'].apply(year_fraction)


Now I use the sns.distplot to overlay the distributions of the 2 most frequently used web technologies over the years.

# Overlay the posting-date distributions for the two most-used clients.
plt.figure(figsize=(9, 6))
sns.distplot([year_fraction(i) for i in iphone['time']], label = 'iPhone');
sns.distplot([year_fraction(i) for i in android['time']], label = 'Android');
plt.yticks([0.00, 0.25, 0.50, 0.75, 1.00, 1.25, 1.50])
plt.xticks([2016, 2017, 2018, 2019])
plt.legend(prop={'size': 20});
plt.show()


## Question 6: Sentiment Analysis¶

It turns out that we can use the words in Trump's tweets to calculate a measure of the sentiment of the tweet. For example, the sentence "I love America!" has positive sentiment, whereas the sentence "I hate taxes!" has a negative sentiment. In addition, some words have stronger positive / negative sentiment than others: "I love America." is more positive than "I like America."

We will use the VADER (Valence Aware Dictionary and sEntiment Reasoner) lexicon to analyze the sentiment of Trump's tweets. VADER is a lexicon and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in social media which is great for our usage.

The VADER lexicon gives the sentiment of individual words. See below:

# Peek at the first ten lines of the raw VADER lexicon file.
print(''.join(open("vader_lexicon.txt").readlines()[:10]))

\$:	-1.5	0.80623	[-1, -1, -1, -1, -3, -1, -3, -1, -2, -1]
%)	-0.4	1.0198	[-1, 0, -1, 0, 0, -2, -1, 2, -1, 0]
%-)	-1.5	1.43178	[-2, 0, -2, -2, -1, 2, -2, -3, -2, -3]
&-:	-0.4	1.42829	[-3, -1, 0, 0, -1, -1, -1, 2, -1, 2]
&:	-0.7	0.64031	[0, -1, -1, -1, 1, -1, -1, -1, -1, -1]
( '}{' )	1.6	0.66332	[1, 2, 2, 1, 1, 2, 2, 1, 3, 1]
(%	-0.9	0.9434	[0, 0, 1, -1, -1, -1, -2, -2, -1, -2]
('-:	2.2	1.16619	[4, 1, 4, 3, 1, 2, 3, 1, 2, 1]
(':	2.3	0.9	[1, 3, 3, 2, 2, 4, 2, 3, 1, 2]
((-:	2.1	0.53852	[2, 2, 2, 1, 2, 3, 2, 2, 3, 2]



As you can see, the lexicon contains emojis too! The first column of the lexicon is the token, or the word itself. The second column is the polarity of the word, or how positive / negative it is.

# Rebuild the lexicon as a one-column DataFrame: token -> polarity score.
# The file is tab-separated: column 0 is the token, column 1 the polarity.
# NOTE(review): the original cell was truncated (`polarity` was undefined);
# reconstructed here so the asserts below still hold.
lexicon = pd.read_table('vader_lexicon.txt', header=None)
index = lexicon.iloc[:, 0]
polarity = {'polarity': lexicon.iloc[:, 1].values}
sent = pd.DataFrame(data=polarity).set_index(index)
sent.index.name = None

assert isinstance(sent, pd.DataFrame)
assert sent.shape == (7517, 1)
assert list(sent.index[5000:5005]) == ['paranoids', 'pardon', 'pardoned', 'pardoning', 'pardons']
assert np.allclose(sent['polarity'].head(), [-1.5, -0.4, -1.5, -0.4, -0.7])


Now, I will use this lexicon to calculate the overall sentiment for each of Trump's tweets. Here's the basic idea:

1. For each tweet, find the sentiment of each word.
2. Calculate the sentiment of each tweet by taking the sum of the sentiments of its words.

First, I lowercase the text in the tweets since the lexicon is also lowercase. Then I set the text column of the trump DF to be the lowercased text of each tweet.

# Lowercase the tweet text so it matches the (lowercase) lexicon tokens.
trump['text'] = trump['text'].str.lower()


Now, I get rid of punctuation since it'll cause us to fail to match words. Below I create a new column called no_punc in the trump DF to be the lowercased text of each tweet with all punctuation replaced by a single space. We consider punctuation characters to be any character that isn't a Unicode word character or a whitespace character.

# Punctuation = any character that is NOT a word character or whitespace.
# \w matches Unicode word characters (letters, digits, underscore), which is
# what the spec above asks for; the original [^A-Za-z\d\s] also stripped
# underscores and non-ASCII letters.  regex=True is explicit for pandas 2.0.
punct_re = r'[^\w\s]'
trump['no_punc'] = trump['text'].str.replace(punct_re, " ", regex=True)


Now, I convert the tweets into what's called a tidy format to make the sentiments easier to calculate. The index of the table is the IDs of the tweets, repeated once for every word in the tweet. It has two columns:

1. num: The location of the word in the tweet. For example, if the tweet was "i love america", then the location of the word "i" is 0, "love" is 1, and "america" is 2.
2. word: The individual words of each tweet.
# One row per (tweet, word): split on whitespace into columns, stack the
# word columns into rows, then move the word-position level out of the
# MultiIndex into a regular column.
tidy_format = trump['no_punc'].str.split(expand=True).stack().to_frame().reset_index(level = 1)
tidy_format.columns = ["num", "word"]

num word
786204978629185536 0 pay
786204978629185536 1 to
786204978629185536 2 play
786204978629185536 3 politics
786204978629185536 4 crookedhillary

Now that I have this table in the tidy format, it becomes much easier to find the sentiment of each tweet: we can join the table with the lexicon table.

Below I add a polarity column to the trump table. The polarity column contains the sum of the sentiment polarity of each word in the text of the tweet.

# Per-tweet polarity: join each word row with the lexicon, then sum the
# word polarities grouped by tweet ID (the index).
trump['polarity'] = (tidy_format.merge(sent, left_on = 'word',
                                       right_index = True)['polarity']
                     .reset_index().groupby('index').sum())
# Tweets with no lexicon words get NaN from the join; treat them as neutral.
# fillna avoids the chained inplace replace() on a column view.
trump['polarity'] = trump['polarity'].fillna(0)

# Five most negative tweets (lowest summed polarity).
# NOTE(review): the original loop line was truncated; reconstructed to match
# the printed output below.
print('Most negative tweets:')
for t in trump.sort_values('polarity').head()['text']:
    print('\n  ', t)

Most negative tweets:

it is outrageous that poisonous synthetic heroin fentanyl comes pouring into the u.s. postal system from china. we can, and must, end this now! the senate should pass the stop act – and firmly stop this poison from killing our children and destroying our country. no more delay!

the rigged russian witch hunt goes on and on as the “originators and founders” of this scam continue to be fired and demoted for their corrupt and illegal activity. all credibility is gone from this terrible hoax, and much more will be lost as it proceeds. no collusion!

james comey is a proven leaker &amp; liar. virtually everyone in washington thought he should be fired for the terrible job he did-until he was, in fact, fired. he leaked classified information, for which he should be prosecuted. he lied to congress under oath. he is a weak and.....

this is an illegally brought rigged witch hunt run by people who are totally corrupt and/or conflicted. it was started and paid for by crooked hillary and the democrats. phony dossier, fisa disgrace and so many lying and dishonest people already fired. 17 angry dems? stay tuned!

where’s the collusion? they made up a phony crime called collusion, and when there was no collusion they say there was obstruction (of a phony crime that never existed). if you fight back or say anything bad about the rigged witch hunt, they scream obstruction!

# Five most positive tweets (highest summed polarity).
# NOTE(review): the original loop line was truncated; reconstructed to match
# the printed output below.
print('Most positive tweets:')
for t in trump.sort_values('polarity', ascending=False).head()['text']:
    print('\n  ', t)

Most positive tweets:

congratulations to patrick reed on his great and courageous masters win! when patrick had his amazing win at doral 5 years ago, people saw his great talent, and a bright future ahead. now he is the masters champion!

my supporters are the smartest, strongest, most hard working and most loyal that we have seen in our countries history. it is a beautiful thing to watch as we win elections and gather support from all over the country. as we get stronger, so does our country. best numbers ever!

thank you to all of my great supporters, really big progress being made. other countries wanting to fix crazy trade deals. economy is roaring. supreme court pick getting great reviews. new poll says trump, at over 90%, is the most popular republican in history of the party. wow!

thank you, @wvgovernor jim justice, for that warm introduction. tonight, it was my great honor to attend the “greenbrier classic – salute to service dinner” in west virginia! god bless our veterans. god bless america - and happy independence day to all! https://t.co/v35qvcn8m6

the republican party had a great night. tremendous voter energy and excitement, and all candidates are those who have a great chance of winning in november. the economy is sooo strong, and with nancy pelosi wanting to end the big tax cuts and raise taxes, why wouldn’t we win?


Now I will plot the distribution of tweet sentiments broken down by whether the text of the tweet contains nyt or fox.

# Sentiment distributions for tweets mentioning 'nyt' vs. 'fox'.
plt.figure(figsize=(9, 6))
nyt = trump[trump['text'].str.contains('nyt', case=False)]
fox = trump[trump['text'].str.contains('fox', case=False)]
sns.distplot(nyt['polarity'], label = "tweet contains 'nyt'")
sns.distplot(fox['polarity'], label = "tweet contains 'fox'")
plt.legend()
plt.show()


Here I explore which words led to a greater average number of retweets. For example, at the time of this writing, Donald Trump has two tweets that contain the word 'oakland' (tweets 932570628451954688 and 1016609920031117312) with 36757 and 10286 retweets respectively, for an average of 23,521.5.

Goal: find the top 20 most retweeted words. Include only words that appear in at least 25 tweets.

# Top 20 words by mean retweet count, restricted to words in >= 25 tweets.
# filter(len >= 25) drops infrequent words; the merge attaches each word row
# to its tweet's metadata via the shared tweet-ID index on both frames.
top_20 = (tidy_format.groupby('word').filter(lambda x: len(x) >= 25)
          .merge(trump, how='inner', right_index=True, left_index=True)
          .groupby('word').agg({'retweet_count': 'mean'}).sort_values(by='retweet_count', ascending=False))
top_20 = top_20.iloc[0:20, :]

assert 'jong'     in top_20.index
assert 'try'     in top_20.index
assert 'kim' in top_20.index
assert 'un'    in top_20.index
assert 'maybe'    in top_20.index

Here's a bar chart of the results:

# Horizontal bar chart; ascending sort puts the biggest bar at the top.
top_20['retweet_count'].sort_values().plot.barh(figsize=(10, 8));