from nltk.tokenize import word_tokenize
import numpy as np
from nltk.corpus import gutenberg as gt
import matplotlib.pyplot as plt
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from bs4 import BeautifulSoup
import requests
import re
import seaborn as sns
import pandas as pd


# Apply the 'seaborn' Matplotlib style sheet to every figure drawn below.
# NOTE(review): Matplotlib 3.6 deprecated this style name in favour of
# 'seaborn-v0_8' -- confirm which Matplotlib version this is run against.
plt.style.use('seaborn')


def get_sentiment(word):
    """Return a sentiment score for ``word`` (positive minus negative).

    The first WordNet synset of ``word`` is looked up in SentiWordNet and
    its negative score is subtracted from its positive score.

    Args:
        word: The token to score.

    Returns:
        ``pos_score() - neg_score()`` of the word's first synset, or ``0``
        (neutral) when the word has no synset or the lookup fails.
    """
    try:
        synsets = wn.synsets(word)
        if not synsets:
            # Unknown word: assume neutral sentiment.
            return 0
        sent = swn.senti_synset(synsets[0].name())
        # Overall sentiment score is positive score minus negative score.
        return sent.pos_score() - sent.neg_score()
    except Exception:
        # Best effort: any lookup failure is treated as neutral. Catching
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate, so a long scoring loop can be interrupted.
        return 0


# Sanity check: print the score get_sentiment assigns to a handful of
# sample words.
for word in ['good', 'bad', 'happy', 'sad', 'terrible',
             'death', 'hate', 'poverty', 'misery', 'party']:
    print(word, get_sentiment(word))

good 0.5
bad -0.875
happy 0.875
sad -0.625
terrible -0.625
death 0.0
hate -0.25
poverty -0.625
misery -0.125
party 0.0


def get_text_from_url(url):
    """Download ``url`` and return the text content of the page.

    The page is first requested with the default ``requests`` headers. If
    that request raises, it is retried once with an empty ``User-Agent``
    header. The response body is parsed with BeautifulSoup and flattened
    with ``get_text()``.

    Args:
        url: Address of the page to download.

    Returns:
        The text of the parsed page as a single string.

    Raises:
        requests.exceptions.RequestException: If the retry fails as well.
    """
    try:
        page = requests.get(url)
    except requests.exceptions.RequestException:
        # Retry once with a blank User-Agent header.
        page = requests.get(url, headers={'User-Agent': ''})

    # Parse after the try/except rather than in a ``finally`` block: a
    # ``finally`` would also run when the retry fails, where ``page`` is
    # unbound and the real network error would be masked.
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever is
    # installed; pin one only after checking the story markers still match.
    soup = BeautifulSoup(page.content)
    return soup.get_text()


# Download the page at cind_url_la and keep its full text in texterella.
cind_url_la = 'https://www.pitt.edu/~dash/grimm021.html'
texterella = get_text_from_url(cind_url_la)

# Preview the first and last 500 characters of the downloaded text.
print(texterella[:500] + '\n\n[...]\n\n' + texterella[-500:])


def extract_story(text, story_start, story_end):
    """Return the part of ``text`` between two markers, markers included.

    Args:
        text: The full text to cut the story out of.
        story_start: Literal string that opens the story; its first
            occurrence is used.
        story_end: Literal string that closes the story; the first
            occurrence at or after the start marker is used.

    Returns:
        The substring running from the beginning of ``story_start`` to the
        end of ``story_end``.

    Raises:
        ValueError: If either marker cannot be found. Without this check a
            failed ``find()`` (-1) would silently produce a wrong slice.
    """
    start_i = text.find(story_start)
    if start_i == -1:
        raise ValueError(f'story_start not found in text: {story_start!r}')

    # Search from the start marker onwards so that an earlier occurrence of
    # the end marker cannot yield an empty or truncated story.
    end_i = text.find(story_end, start_i)
    if end_i == -1:
        raise ValueError(f'story_end not found in text: {story_end!r}')

    return text[start_i:end_i + len(story_end)]


# Opening and closing phrases that delimit the story within the page text.
story_start = 'A rich man\'s wife'
story_end = 'they were punished \nwith blindness as long as they lived.'
# Replace the full page text with just the story between the two markers.
texterella = extract_story(texterella, story_start, story_end)

# Preview the first and last 500 characters of the extracted story.
print(texterella[:500] + '\n\n[...]\n\n' + texterella[-500:])

A rich man's wife became sick, and when she felt that her end was 
drawing near, she called her only daughter to her bedside and said, "Dear 
child, remain pious and good, and then our dear God will always protect 
you, and I will look down on you from heaven and be near you." With this 
she closed her eyes and died. 
The girl went out to her mother's grave every day and wept, and she 
remained pious and good. When winter came the snow spread a white cloth 
over the grave, and when the spring su

[...]

o share her good fortune. 
When the bridal couple walked into the church, the older sister walked on 
their right side and the younger on their left side, and the pigeons 
pecked out one eye from each of them. Afterwards, as they came out of the 
church, the older one was on the left side, and the younger one on the 
right side, and then the pigeons pecked out the other eye from each of 
them. And thus, for their wickedness and falsehood, they were punished 
with blindness as long as they lived.


# Switch to a different page: this overwrites cind_url_la, texterella,
# story_start and story_end from the cells above.
cind_url_la = 'https://www.shortkidstories.com/story/cinderella-2/'
texterella = get_text_from_url(cind_url_la)
# Opening and closing phrases that delimit the story within the page text.
story_start = 'Cinderella’s mother died'
story_end = 'glad that he had found the glass slipper.'
texterella = extract_story(texterella, story_start, story_end)

# Preview the first and last 500 characters of the extracted story.
print(texterella[:500] + '\n\n[...]\n\n' + texterella[-500:])

Cinderella’s mother died while she was a very little child, leaving her to the care of her father and her step-sisters, who were very much older than herself; for Cinderella’s father had been twice married, and her mother was his second wife. Now, Cinderella’s sisters did not love her, and were very unkind to her. As she grew older they made her work as a servant, and even sift the cinders; on which account they used to call her in mockery “Cinderella.” It was not her real name, but she became a

[...]

ed, the Fairy godmother suddenly entered the room, and placing her godchild’s hand in the Prince’s, said:“Take this young girl for your wife, Prince; she is good and patient, and as she has known how to submit to injustice meekly, she will know how to reign justly.”So Cinderella was married to the Prince in great state, and they lived together very happily. She forgave her sisters, and treated them always very kindly, and the Prince had great cause to be glad that he had found the glass slipper.


def text_to_df(text):
    """Split ``text`` into 100 consecutive word windows and score each one.

    The text is tokenized with ``word_tokenize`` and cut into 100 windows
    whose boundaries are proportional to the token count, so every token
    lands in exactly one window (window lengths differ by at most one).

    Args:
        text: The story text to analyse.

    Returns:
        A DataFrame with 100 rows and the columns
        ``Percent`` (0..99, the window number),
        ``Words`` (the list of tokens in the window) and
        ``Sentiment`` (mean ``get_sentiment`` over the window's tokens;
        NaN for a window that contains no tokens).
    """
    n_windows = 100
    words = word_tokenize(text)

    # Proportional integer boundaries. A fixed ``len(words) // 100`` window
    # would drop up to 99 tokens at the end of the story and would be zero
    # for texts shorter than 100 tokens.
    bounds = [len(words) * i // n_windows for i in range(n_windows + 1)]

    def mean_sentiment(window):
        # An empty window has no sentiment to average; avoid np.mean([]),
        # which emits a RuntimeWarning.
        if not window:
            return np.nan
        return np.mean([get_sentiment(word) for word in window])

    df = pd.DataFrame()
    df['Percent'] = np.arange(n_windows)
    df['Words'] = [words[lower:upper]
                   for lower, upper in zip(bounds, bounds[1:])]
    df['Sentiment'] = df['Words'].apply(mean_sentiment)
    return df


# Score the extracted story window by window and peek at the first rows.
cinderella_df = text_to_df(texterella)
cinderella_df.head()


# Widen the column display and add each window's tokens as one readable
# string.
pd.options.display.max_colwidth = 200
cinderella_df['Text'] = cinderella_df['Words'].apply(' '.join)


# The five windows with the highest sentiment (ascending order).
cinderella_df.sort_values('Sentiment')[['Percent', 'Sentiment', 'Text']].tail()


# The five windows with the lowest sentiment.
cinderella_df.sort_values('Sentiment')[['Percent', 'Sentiment', 'Text']].head()


def cinderella_f(x):
    """Piecewise idealised sentiment curve for Cinderella.

    Args:
        x: Position in the story (the callers pass 0..99).

    Returns:
        The curve value at ``x``: flat at -1 below 20, a staircase rising
        by 0.25 every 4 steps on [20, 40), a parabola on [40, 60), flat at
        -0.5 on [60, 80), and a steep rise on [80, 100). ``None`` when
        ``x`` falls in none of these segments.
    """
    # (exclusive upper bound, formula) pairs, checked in ascending order so
    # the first bound above x selects the segment.
    segments = (
        (20, lambda p: -1),
        (40, lambda p: (p-20)//4/4 - 1),
        (60, lambda p: 1-((p-60)/20)**2),
        (80, lambda p: -0.5),
        (100, lambda p: 1 / (99.5-p) - 0.5 - 1/(101-80)),
    )
    for upper_bound, segment in segments:
        if x < upper_bound:
            return segment(x)
    return None
    
# Build the idealised curve on the same 0..99 axis as the measured data.
# ``.copy()`` gives an independent frame, so the column assignments below
# do not trigger pandas' SettingWithCopyWarning.
# NOTE(review): 'Vonnegart' is presumably meant to be 'Vonnegut'; the
# spelling is kept because the same label is used elsewhere in the file.
vonnegart_df = cinderella_df[['Percent']].copy()
vonnegart_df['Sentiment'] = vonnegart_df['Percent'].apply(cinderella_f)
vonnegart_df['Curve'] = 'Vonnegart Curve'
sns.relplot(x='Percent', y='Sentiment', kind='line', data=vonnegart_df)
plt.title('Vonnegart');


# Raw (unsmoothed) per-window sentiment of the story.
sns.relplot(x='Percent', y='Sentiment', kind='line', data=cinderella_df);


def sliding_window(x, window_size):
    """Smooth ``x`` with a moving average of ``window_size`` samples.

    The window for position ``i`` spans ``window_size`` samples placed
    around ``i`` (for even sizes the extra sample is on the right). Near
    the ends of ``x`` the window is clipped to the available samples, so
    the edge averages use fewer values.

    Args:
        x: Sequence of numbers (list, array or pandas Series).
        window_size: Number of samples to average; must be at least 1.

    Returns:
        A list with one averaged value per element of ``x``.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError('window_size should be at least 1')

    # Work positionally on a plain float array, independent of any index
    # the input may carry.
    values = np.asarray(x, dtype=float)
    n = len(values)

    # Samples to the left of the centre. Using (window_size - 1) // 2 keeps
    # even sizes identical to i - w//2 + 1 .. i + w//2 and gives odd sizes
    # their full width instead of one sample too few.
    lead = (window_size - 1) // 2

    ret = []
    for i in range(n):
        lower = max(0, i - lead)
        upper = min(n, i - lead + window_size)
        ret.append(values[lower:upper].mean())
    return ret

# Smooth the sentiment with several window sizes and collect the results in
# one long frame (columns: Percent, Sentiment, Window Size, Curve).
# The per-size frames are gathered in a list and joined with pd.concat,
# because DataFrame.append was removed in pandas 2.0.
window_frames = []

for window_size in [2, 5, 10, 25]:
    # .copy() so the assignments below write to an independent frame.
    window_df = cinderella_df[['Percent']].copy()
    window_df['Sentiment'] = sliding_window(cinderella_df['Sentiment'], window_size)
    window_df['Window Size'] = f'{window_size}%'
    window_df['Curve'] = 'Sliding Window'

    window_frames.append(window_df)

windows_df = pd.concat(window_frames)

# One panel per window size.
sns.relplot(x='Percent', y='Sentiment', col='Window Size', kind='line', data=windows_df);


def ewma(x, alpha):
    """Exponentially weighted moving average of ``x``.

    The first output equals the first input; every later output is
    ``alpha * previous_output + (1 - alpha) * current_input``. Note that
    ``alpha`` weights the *previous* smoothed value here, so a larger
    ``alpha`` gives a smoother curve.

    Args:
        x: Iterable of numbers (list, array, pandas Series, generator).
        alpha: Weight given to the previous smoothed value.

    Returns:
        A list of smoothed values, one per element of ``x``; an empty list
        when ``x`` is empty.
    """
    # Iterate instead of indexing: ``x[0]`` is a label lookup on a pandas
    # Series while ``x[1:]`` is positional, and neither works for
    # generators or empty input.
    values = iter(x)
    try:
        S = next(values)
    except StopIteration:
        return []

    ret = [S]
    for x_i in values:
        S = alpha * S + (1 - alpha) * x_i
        ret.append(S)
    return ret

# Smooth the sentiment with several EWMA weights and collect the results in
# one long frame (columns: Percent, Sentiment, Alpha, Curve).
# The per-alpha frames are gathered in a list and joined with pd.concat,
# because DataFrame.append was removed in pandas 2.0.
ewma_frames = []

for alpha in [0.25, 0.5, 0.75, 0.9]:
    # .copy() so the assignments below write to an independent frame.
    ewma_df = cinderella_df[['Percent']].copy()
    ewma_df['Sentiment'] = ewma(cinderella_df['Sentiment'], alpha)
    ewma_df['Alpha'] = alpha
    ewma_df['Curve'] = 'EWMA'

    ewma_frames.append(ewma_df)

ewmas_df = pd.concat(ewma_frames)

# One panel per alpha.
sns.relplot(x='Percent', y='Sentiment', col='Alpha', kind='line', data=ewmas_df);


# Take the 10% sliding-window curve; .copy() so the rescaling below writes
# to an independent frame rather than a slice of windows_df.
window_df = windows_df[windows_df['Window Size'] == '10%'].copy()
min_sent = window_df['Sentiment'].min()
max_sent = window_df['Sentiment'].max()
# Min-max rescale the smoothed sentiment to the range [-1, 1].
window_df['Sentiment'] = (window_df['Sentiment'] - min_sent) / (max_sent - min_sent) * 2 - 1
# Stack the measured and the idealised curve (pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0).
cinderella_df_2 = pd.concat([window_df, vonnegart_df])

sns.relplot(x='Percent', y='Sentiment', hue='Curve', style='Curve', kind='line', data=cinderella_df_2);


def hamlet_f(x):
    """Idealised Hamlet curve: a flat line at -0.5 for every ``x``."""
    flat_level = -0.5
    return flat_level

# Download the page at hamlet_url and keep the text from the first 'ACT I'
# through the closing line given below.
hamlet_url = 'http://shakespeare.mit.edu/hamlet/full.html'
hamlet_text = get_text_from_url(hamlet_url)
words_words_words = extract_story(hamlet_text, 'ACT I', 'Go, bid the soldiers shoot.')

def kafka_f(x):
    """Idealised Kafka curve: a downward parabola that stops at x = 20.

    Args:
        x: Position in the story.

    Returns:
        ``-0.5 - (x/20)**2`` while ``x`` is below 20, otherwise ``None``
        (the curve has no value there).
    """
    return -0.5 - (x/20)**2 if x < 20 else None

# Download the plain-text file at kafka_url and keep the text from
# 'One morning' through the closing phrase given below.
kafka_url = 'https://www.gutenberg.org/cache/epub/5200/pg5200.txt'
kafka_text = get_text_from_url(kafka_url)
kafka_text = extract_story(kafka_text, 'One morning', 'stretch out her young body.')

def man_in_hole_f(x):
    """Idealised 'Man in Hole' curve.

    One full cosine cycle over ``x`` in [0, 100], scaled by an envelope
    that grows linearly from 0.5 at ``x = 0`` to 1.0 at ``x = 100``.
    """
    oscillation = np.cos(x*2*np.pi/100)
    envelope = 0.5 + 0.5*x/100
    return oscillation * envelope

# Download the page at hobbit_url and keep the text from the first
# 'Chapter I' through the closing phrase given below.
hobbit_url = 'https://archive.org/stream/TheHobbitByJRRTolkienEBOOK/The%20Hobbit%20byJ%20%20RR%20Tolkien%20EBOOK_djvu.txt'
hobbit_text = get_text_from_url(hobbit_url)
hobbit_text = extract_story(hobbit_text, 'Chapter I', 'handed him the tobacco-jar.')

def boy_girl_f(x):
    """Idealised 'Boy meets Girl' curve.

    A sine wave covering 1.25 cycles over ``x`` in [0, 100], scaled by an
    envelope that grows linearly from 0.5 at ``x = 0`` to 1.0 at
    ``x = 100``.
    """
    oscillation = np.sin(x*2.5*np.pi/100)
    envelope = 0.5 + 0.5*x/100
    return oscillation * envelope

# Download the page at jane_eyre_url and keep the text from the first
# 'CHAPTER I' through the closing phrase given below (the '\r\n' is part
# of the marker that is searched for).
jane_eyre_url = "http://gutenberg.org/files/1260/1260-h/1260-h.htm"
jane_eyre_text = get_text_from_url(jane_eyre_url)
jane_eyre_text = extract_story(jane_eyre_text, 'CHAPTER I', 'Amen; even so come, Lord\r\nJesus!’”')

# Three parallel lists, matched by position: the idealised curve function,
# the story text it is compared against, and the label used for the story.
story_fs = [cinderella_f, hamlet_f, kafka_f, man_in_hole_f, boy_girl_f]
story_texts = [texterella, words_words_words, kafka_text, hobbit_text, jane_eyre_text]
story_names = ['Cinderella', 'Hamlet', 'Kafka', 'Man in Hole', 'Boy meets Girl']


# For every story, build the measured ("Empirical") curve and the idealised
# ("Vonnegart Curve") curve on the same 0..99 axis and stack them all into
# storys_df (columns: Percent, Words, Sentiment, Curve, Story).
# The per-story frames are gathered in a list and joined with pd.concat,
# because DataFrame.append was removed in pandas 2.0. The previously
# declared but never filled 'Progress' column is no longer created; the
# plot below uses 'Percent'.
story_frames = []
for f, story, story_name in zip(story_fs, story_texts, story_names):

    # Empirical plot: per-window sentiment, smoothed with a 10-sample
    # sliding window, then min-max rescaled to [-1, 1].
    story_df = text_to_df(story)
    story_df['Sentiment'] = sliding_window(story_df['Sentiment'], 10)
    story_df['Curve'] = 'Empirical'
    sent_min = story_df['Sentiment'].min()
    sent_max = story_df['Sentiment'].max()
    story_df['Sentiment'] = (story_df['Sentiment'] - sent_min) * 2 / (sent_max - sent_min) - 1

    # Vonnegart plot: same rows, with the sentiment replaced by the
    # idealised curve evaluated at each Percent value.
    vonnegart_df = story_df.copy()
    vonnegart_df['Sentiment'] = vonnegart_df['Percent'].apply(f)
    vonnegart_df['Curve'] = 'Vonnegart Curve'

    story_df = pd.concat([story_df, vonnegart_df])
    story_df['Story'] = story_name

    story_frames.append(story_df)

storys_df = pd.concat(story_frames)

# One panel per story, both curves in each panel.
sns.relplot(x='Percent', y='Sentiment', col='Story', hue='Curve', style='Curve', kind='line', data=storys_df);


def get_extremes(story_name, n_peaks=5, mode='peaks'):
    """Return the most extreme, well-separated segments of a story.

    Reads the module-level ``storys_df`` and looks only at the rows whose
    ``Story`` equals ``story_name`` and whose ``Curve`` is ``'Empirical'``.
    Segments are taken in order of sentiment; after each pick every
    segment within 10 ``Percent`` points of it is discarded, so each
    discrete peak (or trough) is represented only once.

    Args:
        story_name: Value of the ``Story`` column to select.
        n_peaks: Maximum number of segments to return.
        mode: ``'peaks'`` for the highest sentiment first, ``'troughs'``
            for the lowest first.

    Returns:
        A DataFrame indexed 0..k-1 with the columns ``Percent``,
        ``Sentiment`` and ``Text`` (the segment's words joined with
        spaces, shortened to about 160 characters). ``k`` is smaller than
        ``n_peaks`` when the story runs out of well-separated segments.

    Raises:
        ValueError: If ``mode`` is neither ``'peaks'`` nor ``'troughs'``.
    """
    # Sort the story segments by sentiment
    if mode == 'peaks':
        ascending = False
    elif mode == 'troughs':
        ascending = True
    else:
        raise ValueError('mode should be "peaks" or "troughs"')

    is_story = (storys_df['Story'] == story_name) & (storys_df['Curve'] == 'Empirical')
    # sort_values returns a new frame; sorting the filtered slice in place
    # raised SettingWithCopyWarning. The fresh 0..n-1 index gives every row
    # a unique label to pick by.
    ranked = (storys_df[is_story]
              .sort_values('Sentiment', ascending=ascending)
              .reset_index(drop=True))

    # This loop makes sure that each discrete peak is only represented once.
    # It stops early when no candidates are left instead of failing on an
    # empty frame.
    remaining = ranked
    picked = []
    while len(picked) < n_peaks and not remaining.empty:
        label = remaining.index[0]
        picked.append(label)
        percentage = remaining.at[label, 'Percent']

        # exclude all other rows within 10 percentage points of this peak
        far_enough = ((remaining['Percent'] > percentage + 10)
                      | (remaining['Percent'] < percentage - 10))
        remaining = remaining[far_enough]

    top_lines = ranked.loc[picked].reset_index(drop=True)

    # Make the dataframe look pretty
    def words_to_text(words):
        text = ' '.join(words)
        text_len = 160
        if len(text) > text_len:
            # Keep the beginning and the end of long segments.
            half_len = text_len // 2
            text = text[:half_len] + ' ... ' + text[-half_len:]
        return text

    top_lines['Text'] = top_lines['Words'].apply(words_to_text)
    top_lines = top_lines[['Percent', 'Sentiment', 'Text']]

    return top_lines


# Show the highest-sentiment, well-separated segments of Hamlet.
get_extremes('Hamlet', mode='peaks')

/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()

	Percent	Words	Sentiment
0	0	[Cinderella, ’, s, mother, died, while, she, was, a, very, little, child, ,, leaving, her, to]	-0.015625
1	1	[the, care, of, her, father, and, her, step-sisters, ,, who, were, very, much, older, than, herself]	0.070312
2	2	[;, for, Cinderella, ’, s, father, had, been, twice, married, ,, and, her, mother, was, his]	-0.007812
3	3	[second, wife, ., Now, ,, Cinderella, ’, s, sisters, did, not, love, her, ,, and, were]	-0.023438
4	4	[very, unkind, to, her, ., As, she, grew, older, they, made, her, work, as, a, servant]	0.070312

	Percent	Sentiment	Text
1	1	0.070312	the care of her father and her step-sisters , who were very much older than herself
57	57	0.078125	envious eyes , and knew that they wished they were as beautiful , and as well-dressed
7	7	0.085938	well known by it that her proper one has been forgotten.She was a very sweet-tempered ,
65	65	0.093750	have new dresses , for she is so splendid . She makes every one look shabby.
45	45	0.117188	amused to hear them admire her grace and beauty , and say that they were sure

	Percent	Sentiment	Text
70	70	-0.093750	out of his sight , and Cinderella , who was getting a little spoiled by all
89	89	-0.078125	be a Princess tried to put it on , but in vain . Cinderella ’ s
86	86	-0.054688	and as he felt sure that no one else could wear such a tiny shoe as
55	55	-0.054688	he asked her to dance , and would have no other partner , and as he
90	90	-0.046875	sisters tried , but could not get it on , and then Cinderella asked if she

	Percent	Sentiment	Text
0	26	1	GERTRUDE Thanks , Guildenstern and gentle Rosencrantz : And I beseech you instan ... rom Norway , and in fine Makes vow before his uncle never more To give the assay
1	45	0.230961	do not that way tend ; Nor what he spake , though it lack 'd form a little , Was ... a robustious periwig-pated fellow tear a passion to tatters , to very rags , to
2	0	0.180527	ACT I SCENE I. Elsinore . A platform before the castle . FRANCISCO at his post . ... of heaven Where now it burns , Marcellus and myself , The bell then beating one
3	78	0.127854	how otherwise ? -- Will you be ruled by me ? LAERTES Ay , my lord ; So you will ... ING CLAUDIUS He made confession of you , And gave you such a masterly report For
4	66	0.0867992	you me for a sponge , my lord ? HAMLET Ay , sir , that soaks up the king 's coun ... ? HAMLET At supper . KING CLAUDIUS At supper ! where ? HAMLET Not where he eats

The Shape of Stories

13/01/2020

Sentiment Analysis

Text Extraction

Data Inspection

The Sentiment Plot

Smoothing the Sentiment Plot

Sliding Window

EWMA

All The Stories!

Conclusion