Sentiment classification using simple neural network

Load Dataset

In [2]:
def pretty_print_review_and_label(i):
    """Print one labelled review preview on a single line.

    Reads the module-level ``labels`` and ``reviews`` sequences and prints
    the label at index ``i``, a tab-colon-tab separator, the first 80
    characters of the review at index ``i``, and a trailing ``...``.
    The ``...`` is always appended, even when the review is shorter than
    80 characters.

    Args:
        i: Index of the review/label pair to print.
    """
    label = labels[i]
    snippet = reviews[i][:80]
    print("\t:\t".join([label, snippet + "..."]))

# Read the review texts, one entry per line of reviews.txt.  What we know!
# The `with` block closes the file even if reading raises.
with open('reviews.txt', 'r') as reviews_file:
    # Strip only the line terminator, so a final line that lacks a trailing
    # newline keeps its last character.
    reviews = [line.rstrip('\n') for line in reviews_file]

# Read the labels, one entry per line of labels.txt, upper-cased.  What we WANT to know!
# labels[i] is used elsewhere in the notebook as the label of reviews[i].
with open('labels.txt', 'r') as labels_file:
    labels = [line.rstrip('\n').upper() for line in labels_file]
In [3]:
# Number of reviews that were read from reviews.txt
len(reviews)
Out[3]:
25000
In [4]:
# Inspect the full text of the first review
reviews[0]
Out[4]:
'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   '
In [5]:
# Inspect the (upper-cased) label stored for the first review
labels[0]
Out[5]:
'POSITIVE'

Develop a Predictive Theory

In [6]:
# Header row, followed by a hand-picked set of labelled review previews
print("labels.txt \t : \t reviews.txt\n")
for sample_index in (2137, 12816, 6267, 21934, 5297, 4998):
    pretty_print_review_and_label(sample_index)
labels.txt 	 : 	 reviews.txt

NEGATIVE	:	this movie is terrible but it has some good effects .  ...
POSITIVE	:	adrian pasdar is excellent is this film . he makes a fascinating woman .  ...
NEGATIVE	:	comment this movie is impossible . is terrible  very improbable  bad interpretat...
POSITIVE	:	excellent episode movie ala pulp fiction .  days   suicides . it doesnt get more...
NEGATIVE	:	if you haven  t seen this  it  s terrible . it is pure trash . i saw this about ...
POSITIVE	:	this schiffer guy is a real genius  the movie is of excellent quality and both e...

Quick Theory Validation

In [7]:
from collections import Counter
import numpy as np

We'll create three Counter objects: one for words from positive reviews, one for words from negative reviews, and one for all the words.

In [8]:
# Three independent, initially empty Counters: words seen in positive reviews,
# words seen in negative reviews, and words seen in any review
positive_counts, negative_counts, total_counts = (Counter() for _ in range(3))
In [9]:
# Tally every word of every review twice: once in the counter that matches the
# review's label and once in the overall counter
for i, review in enumerate(reviews):
    # Any label other than 'POSITIVE' is tallied as negative
    label_counts = positive_counts if labels[i] == 'POSITIVE' else negative_counts
    # Splitting on a single space keeps empty-string tokens wherever the text
    # contains consecutive spaces
    for word in review.split(" "):
        label_counts[word] += 1
        total_counts[word] += 1

Run the following two cells to list the words used in positive reviews and negative reviews, respectively, ordered from most to least commonly used.

In [10]:
# Examine the counts of the most common words in positive reviews.
# Called without an argument, most_common() returns every (word, count) pair,
# ordered from the most frequent word to the least frequent.
positive_counts.most_common()
Out[10]:
[('', 550468),
 ('the', 173324),
 ('.', 159654),
 ('and', 89722),
 ('a', 83688),
 ('of', 76855),
 ('to', 66746),
 ('is', 57245),
 ('in', 50215),
 ('br', 49235),
 ('it', 48025),
 ('i', 40743),
 ('that', 35630),
 ('this', 35080),
 ('s', 33815),
 ('as', 26308),
 ('with', 23247),
 ('for', 22416),
 ('was', 21917),
 ('film', 20937),
 ('but', 20822),
 ('movie', 19074),
 ('his', 17227),
 ('on', 17008),
 ('you', 16681),
 ('he', 16282),
 ('are', 14807),
 ('not', 14272),
 ('t', 13720),
 ('one', 13655),
 ('have', 12587),
 ('be', 12416),
 ('by', 11997),
 ('all', 11942),
 ('who', 11464),
 ('an', 11294),
 ('at', 11234),
 ('from', 10767),
 ('her', 10474),
 ('they', 9895),
 ('has', 9186),
 ('so', 9154),
 ('like', 9038),
 ('about', 8313),
 ('very', 8305),
 ('out', 8134),
 ('there', 8057),
 ('she', 7779),
 ('what', 7737),
 ('or', 7732),
 ('good', 7720),
 ('more', 7521),
 ('when', 7456),
 ('some', 7441),
 ('if', 7285),
 ('just', 7152),
 ('can', 7001),
 ('story', 6780),
 ('time', 6515),
 ('my', 6488),
 ('great', 6419),
 ('well', 6405),
 ('up', 6321),
 ('which', 6267),
 ('their', 6107),
 ('see', 6026),
 ('also', 5550),
 ('we', 5531),
 ('really', 5476),
 ('would', 5400),
 ('will', 5218),
 ('me', 5167),
 ('had', 5148),
 ('only', 5137),
 ('him', 5018),
 ('even', 4964),
 ('most', 4864),
 ('other', 4858),
 ('were', 4782),
 ('first', 4755),
 ('than', 4736),
 ('much', 4685),
 ('its', 4622),
 ('no', 4574),
 ('into', 4544),
 ('people', 4479),
 ('best', 4319),
 ('love', 4301),
 ('get', 4272),
 ('how', 4213),
 ('life', 4199),
 ('been', 4189),
 ('because', 4079),
 ('way', 4036),
 ('do', 3941),
 ('made', 3823),
 ('films', 3813),
 ('them', 3805),
 ('after', 3800),
 ('many', 3766),
 ('two', 3733),
 ('too', 3659),
 ('think', 3655),
 ('movies', 3586),
 ('characters', 3560),
 ('character', 3514),
 ('don', 3468),
 ('man', 3460),
 ('show', 3432),
 ('watch', 3424),
 ('seen', 3414),
 ('then', 3358),
 ('little', 3341),
 ('still', 3340),
 ('make', 3303),
 ('could', 3237),
 ('never', 3226),
 ('being', 3217),
 ('where', 3173),
 ('does', 3069),
 ('over', 3017),
 ('any', 3002),
 ('while', 2899),
 ('know', 2833),
 ('did', 2790),
 ('years', 2758),
 ('here', 2740),
 ('ever', 2734),
 ('end', 2696),
 ('these', 2694),
 ('such', 2590),
 ('real', 2568),
 ('scene', 2567),
 ('back', 2547),
 ('those', 2485),
 ('though', 2475),
 ('off', 2463),
 ('new', 2458),
 ('your', 2453),
 ('go', 2440),
 ('acting', 2437),
 ('plot', 2432),
 ('world', 2429),
 ('scenes', 2427),
 ('say', 2414),
 ('through', 2409),
 ('makes', 2390),
 ('better', 2381),
 ('now', 2368),
 ('work', 2346),
 ('young', 2343),
 ('old', 2311),
 ('ve', 2307),
 ('find', 2272),
 ('both', 2248),
 ('before', 2177),
 ('us', 2162),
 ('again', 2158),
 ('series', 2153),
 ('quite', 2143),
 ('something', 2135),
 ('cast', 2133),
 ('should', 2121),
 ('part', 2098),
 ('always', 2088),
 ('lot', 2087),
 ('another', 2075),
 ('actors', 2047),
 ('director', 2040),
 ('family', 2032),
 ('own', 2016),
 ('between', 2016),
 ('m', 1998),
 ('may', 1997),
 ('same', 1972),
 ('role', 1967),
 ('watching', 1966),
 ('every', 1954),
 ('funny', 1953),
 ('doesn', 1935),
 ('performance', 1928),
 ('few', 1918),
 ('bad', 1907),
 ('look', 1900),
 ('re', 1884),
 ('why', 1855),
 ('things', 1849),
 ('times', 1832),
 ('big', 1815),
 ('however', 1795),
 ('actually', 1790),
 ('action', 1789),
 ('going', 1783),
 ('bit', 1757),
 ('comedy', 1742),
 ('down', 1740),
 ('music', 1738),
 ('must', 1728),
 ('take', 1709),
 ('saw', 1692),
 ('long', 1690),
 ('right', 1688),
 ('fun', 1686),
 ('fact', 1684),
 ('excellent', 1683),
 ('around', 1674),
 ('didn', 1672),
 ('without', 1671),
 ('thing', 1662),
 ('thought', 1639),
 ('got', 1635),
 ('each', 1630),
 ('day', 1614),
 ('feel', 1597),
 ('seems', 1596),
 ('come', 1594),
 ('done', 1586),
 ('beautiful', 1580),
 ('especially', 1572),
 ('played', 1571),
 ('almost', 1566),
 ('want', 1562),
 ('yet', 1556),
 ('give', 1553),
 ('pretty', 1549),
 ('last', 1543),
 ('since', 1519),
 ('different', 1504),
 ('although', 1501),
 ('gets', 1490),
 ('true', 1487),
 ('interesting', 1481),
 ('job', 1470),
 ('enough', 1455),
 ('our', 1454),
 ('shows', 1447),
 ('horror', 1441),
 ('woman', 1439),
 ('tv', 1400),
 ('probably', 1398),
 ('father', 1395),
 ('original', 1393),
 ('girl', 1390),
 ('point', 1379),
 ('plays', 1378),
 ('wonderful', 1372),
 ('course', 1358),
 ('far', 1358),
 ('john', 1350),
 ('rather', 1340),
 ('isn', 1328),
 ('ll', 1326),
 ('dvd', 1324),
 ('later', 1324),
 ('war', 1310),
 ('whole', 1310),
 ('d', 1307),
 ('away', 1306),
 ('found', 1306),
 ('screen', 1305),
 ('nothing', 1300),
 ('year', 1297),
 ('once', 1296),
 ('hard', 1294),
 ('together', 1280),
 ('set', 1277),
 ('am', 1277),
 ('having', 1266),
 ('making', 1265),
 ('place', 1263),
 ('might', 1260),
 ('comes', 1260),
 ('sure', 1253),
 ('american', 1248),
 ('play', 1245),
 ('kind', 1244),
 ('takes', 1242),
 ('perfect', 1242),
 ('performances', 1237),
 ('himself', 1230),
 ('worth', 1221),
 ('everyone', 1221),
 ('anyone', 1214),
 ('actor', 1203),
 ('three', 1201),
 ('wife', 1196),
 ('classic', 1192),
 ('goes', 1186),
 ('ending', 1178),
 ('version', 1168),
 ('star', 1149),
 ('enjoy', 1146),
 ('book', 1142),
 ('nice', 1132),
 ('everything', 1128),
 ('during', 1124),
 ('put', 1118),
 ('seeing', 1111),
 ('least', 1102),
 ('house', 1100),
 ('high', 1095),
 ('watched', 1094),
 ('men', 1087),
 ('loved', 1087),
 ('night', 1082),
 ('anything', 1075),
 ('believe', 1071),
 ('guy', 1071),
 ('top', 1063),
 ('amazing', 1058),
 ('hollywood', 1056),
 ('looking', 1053),
 ('main', 1044),
 ('definitely', 1043),
 ('gives', 1031),
 ('home', 1029),
 ('seem', 1028),
 ('episode', 1023),
 ('sense', 1020),
 ('audience', 1020),
 ('truly', 1017),
 ('special', 1011),
 ('fan', 1009),
 ('second', 1009),
 ('short', 1009),
 ('mind', 1005),
 ('human', 1001),
 ('recommend', 999),
 ('full', 996),
 ('black', 995),
 ('help', 991),
 ('along', 989),
 ('trying', 987),
 ('small', 986),
 ('death', 985),
 ('friends', 981),
 ('remember', 974),
 ('often', 970),
 ('said', 966),
 ('favorite', 962),
 ('heart', 959),
 ('early', 957),
 ('left', 956),
 ('until', 955),
 ('let', 954),
 ('script', 954),
 ('maybe', 937),
 ('today', 936),
 ('less', 934),
 ('live', 934),
 ('moments', 933),
 ('others', 929),
 ('brilliant', 926),
 ('shot', 925),
 ('liked', 923),
 ('become', 916),
 ('won', 915),
 ('used', 910),
 ('style', 907),
 ('mother', 895),
 ('lives', 894),
 ('came', 893),
 ('stars', 890),
 ('cinema', 889),
 ('looks', 885),
 ('perhaps', 884),
 ('read', 882),
 ('enjoyed', 879),
 ('boy', 875),
 ('drama', 873),
 ('highly', 871),
 ('given', 870),
 ('playing', 867),
 ('use', 864),
 ('next', 859),
 ('women', 858),
 ('fine', 857),
 ('effects', 856),
 ('kids', 854),
 ('entertaining', 853),
 ('need', 852),
 ('line', 850),
 ('works', 848),
 ('someone', 847),
 ('mr', 836),
 ('simply', 835),
 ('picture', 833),
 ('children', 833),
 ('keep', 831),
 ('friend', 831),
 ('face', 831),
 ('dark', 830),
 ('overall', 828),
 ('certainly', 828),
 ('minutes', 827),
 ('wasn', 824),
 ('history', 822),
 ('finally', 820),
 ('couple', 816),
 ('against', 815),
 ('son', 809),
 ('understand', 808),
 ('lost', 807),
 ('michael', 805),
 ('else', 801),
 ('throughout', 798),
 ('fans', 797),
 ('city', 792),
 ('reason', 789),
 ('written', 787),
 ('production', 787),
 ('several', 784),
 ('school', 783),
 ('rest', 781),
 ('based', 781),
 ('try', 780),
 ('dead', 776),
 ('hope', 775),
 ('strong', 768),
 ('white', 765),
 ('tell', 759),
 ('itself', 758),
 ('half', 753),
 ('person', 749),
 ('sometimes', 746),
 ('start', 744),
 ('past', 744),
 ('genre', 743),
 ('beginning', 739),
 ('final', 739),
 ('town', 738),
 ('art', 734),
 ('game', 732),
 ('humor', 732),
 ('yes', 731),
 ('idea', 731),
 ('late', 730),
 ('despite', 729),
 ('becomes', 729),
 ('case', 726),
 ('able', 726),
 ('money', 723),
 ('child', 721),
 ('completely', 721),
 ('side', 719),
 ('camera', 716),
 ('getting', 714),
 ('instead', 712),
 ('soon', 702),
 ('under', 700),
 ('viewer', 699),
 ('age', 697),
 ('stories', 696),
 ('days', 696),
 ('simple', 694),
 ('felt', 694),
 ('roles', 693),
 ('video', 688),
 ('either', 683),
 ('name', 683),
 ('doing', 677),
 ('turns', 674),
 ('close', 671),
 ('wants', 671),
 ('title', 669),
 ('wrong', 668),
 ('went', 666),
 ('james', 665),
 ('evil', 659),
 ('budget', 657),
 ('episodes', 657),
 ('relationship', 655),
 ('piece', 653),
 ('fantastic', 653),
 ('david', 651),
 ('turn', 648),
 ('murder', 646),
 ('parts', 645),
 ('brother', 644),
 ('absolutely', 643),
 ('head', 643),
 ('experience', 642),
 ('eyes', 641),
 ('sex', 638),
 ('called', 637),
 ('direction', 637),
 ('directed', 636),
 ('lines', 634),
 ('behind', 633),
 ('sort', 632),
 ('actress', 631),
 ('lead', 630),
 ('oscar', 628),
 ('example', 627),
 ('including', 627),
 ('known', 625),
 ('musical', 625),
 ('chance', 621),
 ('score', 620),
 ('hit', 619),
 ('feeling', 619),
 ('already', 619),
 ('voice', 615),
 ('living', 612),
 ('moment', 612),
 ('low', 610),
 ('supporting', 610),
 ('ago', 609),
 ('themselves', 608),
 ('reality', 605),
 ('hilarious', 605),
 ('jack', 604),
 ('told', 603),
 ('hand', 601),
 ('quality', 600),
 ('dialogue', 600),
 ('moving', 600),
 ('happy', 599),
 ('song', 599),
 ('paul', 598),
 ('matter', 598),
 ('light', 594),
 ('future', 593),
 ('entire', 592),
 ('finds', 591),
 ('gave', 589),
 ('laugh', 587),
 ('released', 586),
 ('expect', 584),
 ('fight', 581),
 ('particularly', 580),
 ('police', 579),
 ('cinematography', 579),
 ('sound', 578),
 ('whose', 578),
 ('type', 578),
 ('enjoyable', 573),
 ('view', 573),
 ('romantic', 572),
 ('husband', 572),
 ('daughter', 572),
 ('number', 572),
 ('documentary', 571),
 ('self', 570),
 ('robert', 569),
 ('modern', 569),
 ('took', 569),
 ('superb', 569),
 ('mean', 566),
 ('shown', 563),
 ('coming', 561),
 ('important', 560),
 ('king', 559),
 ('leave', 559),
 ('change', 558),
 ('wanted', 555),
 ('somewhat', 555),
 ('tells', 554),
 ('country', 552),
 ('run', 552),
 ('career', 552),
 ('events', 552),
 ('heard', 550),
 ('season', 550),
 ('girls', 549),
 ('greatest', 549),
 ('etc', 547),
 ('care', 546),
 ('starts', 545),
 ('english', 542),
 ('killer', 541),
 ('animation', 540),
 ('guys', 540),
 ('totally', 540),
 ('tale', 540),
 ('usual', 539),
 ('miss', 535),
 ('opinion', 535),
 ('violence', 531),
 ('easy', 531),
 ('songs', 530),
 ('british', 528),
 ('says', 526),
 ('realistic', 525),
 ('writing', 524),
 ('writer', 522),
 ('act', 522),
 ('comic', 521),
 ('thriller', 519),
 ('television', 517),
 ('power', 516),
 ('ones', 515),
 ('kid', 514),
 ('york', 513),
 ('novel', 513),
 ('problem', 512),
 ('alone', 512),
 ('attention', 509),
 ('involved', 508),
 ('kill', 507),
 ('extremely', 507),
 ('seemed', 506),
 ('french', 505),
 ('hero', 505),
 ('rock', 504),
 ('stuff', 501),
 ('wish', 499),
 ('begins', 498),
 ('sad', 497),
 ('taken', 497),
 ('ways', 496),
 ('richard', 495),
 ('knows', 494),
 ('atmosphere', 493),
 ('car', 491),
 ('taking', 491),
 ('surprised', 491),
 ('similar', 491),
 ('george', 490),
 ('perfectly', 490),
 ('team', 489),
 ('across', 489),
 ('sequence', 489),
 ('eye', 489),
 ('due', 488),
 ('serious', 488),
 ('room', 488),
 ('powerful', 488),
 ('among', 488),
 ('b', 487),
 ('cannot', 487),
 ('strange', 487),
 ('order', 487),
 ('beauty', 486),
 ('famous', 485),
 ('tries', 484),
 ('myself', 484),
 ('herself', 484),
 ('happened', 484),
 ('class', 483),
 ('four', 482),
 ('cool', 481),
 ('theme', 479),
 ('release', 479),
 ('anyway', 479),
 ('opening', 478),
 ('entertainment', 477),
 ('slow', 475),
 ('exactly', 475),
 ('ends', 475),
 ('unique', 475),
 ('easily', 474),
 ('o', 474),
 ('level', 474),
 ('red', 474),
 ('interest', 472),
 ('happen', 471),
 ('crime', 470),
 ('viewing', 468),
 ('memorable', 467),
 ('sets', 467),
 ('group', 466),
 ('stop', 466),
 ('sister', 463),
 ('working', 463),
 ('dance', 463),
 ('message', 463),
 ('problems', 463),
 ('knew', 462),
 ('nature', 461),
 ('mystery', 461),
 ('bring', 460),
 ('thinking', 459),
 ('brought', 459),
 ('believable', 459),
 ('mostly', 458),
 ('disney', 457),
 ('couldn', 457),
 ('society', 456),
 ('lady', 455),
 ('within', 455),
 ('blood', 454),
 ('parents', 453),
 ('viewers', 453),
 ('upon', 453),
 ('soundtrack', 452),
 ('form', 452),
 ('meets', 452),
 ('tom', 452),
 ('peter', 452),
 ('usually', 452),
 ('local', 450),
 ('follow', 448),
 ('certain', 448),
 ('whether', 447),
 ('possible', 446),
 ('emotional', 445),
 ('de', 444),
 ('killed', 444),
 ('above', 444),
 ('god', 443),
 ('middle', 443),
 ('happens', 442),
 ('flick', 442),
 ('needs', 442),
 ('masterpiece', 441),
 ('period', 440),
 ('major', 440),
 ('haven', 439),
 ('named', 439),
 ('particular', 438),
 ('th', 438),
 ('earth', 437),
 ('feature', 437),
 ('stand', 436),
 ('words', 435),
 ('typical', 435),
 ('obviously', 433),
 ('elements', 433),
 ('romance', 431),
 ('jane', 430),
 ('yourself', 427),
 ('showing', 427),
 ('fantasy', 426),
 ('brings', 426),
 ('america', 423),
 ('guess', 423),
 ('huge', 422),
 ('unfortunately', 422),
 ('running', 421),
 ('indeed', 421),
 ('talent', 420),
 ('stage', 419),
 ('started', 418),
 ('japanese', 417),
 ('leads', 417),
 ('sweet', 417),
 ('deal', 416),
 ('poor', 416),
 ('personal', 413),
 ('incredible', 413),
 ('fast', 412),
 ('became', 410),
 ('deep', 410),
 ('hours', 409),
 ('giving', 408),
 ('nearly', 408),
 ('dream', 408),
 ('clearly', 407),
 ('turned', 407),
 ('near', 406),
 ('obvious', 406),
 ('surprise', 405),
 ('cut', 405),
 ('era', 404),
 ('body', 404),
 ('female', 403),
 ('five', 403),
 ('hour', 403),
 ('note', 399),
 ('truth', 398),
 ('learn', 398),
 ('except', 397),
 ('match', 397),
 ('tony', 397),
 ('feels', 397),
 ('clear', 394),
 ('complete', 394),
 ('filmed', 394),
 ('eventually', 393),
 ('lots', 393),
 ('street', 393),
 ('keeps', 393),
 ('older', 393),
 ('buy', 392),
 ('stewart', 391),
 ('william', 391),
 ('fall', 390),
 ('meet', 390),
 ('joe', 390),
 ('difficult', 389),
 ('shots', 389),
 ('talking', 389),
 ('unlike', 389),
 ('rating', 389),
 ('dramatic', 388),
 ('means', 388),
 ('subject', 386),
 ('situation', 386),
 ('present', 386),
 ('appears', 386),
 ('wonder', 386),
 ('comments', 385),
 ('sequences', 383),
 ('general', 383),
 ('lee', 383),
 ('points', 382),
 ('earlier', 382),
 ('gone', 379),
 ('check', 379),
 ('suspense', 378),
 ('ten', 378),
 ('recommended', 378),
 ('business', 377),
 ('third', 377),
 ('leaves', 375),
 ('talk', 375),
 ('beyond', 375),
 ('portrayal', 374),
 ('beautifully', 373),
 ('bill', 372),
 ('single', 372),
 ('plenty', 371),
 ('word', 371),
 ('whom', 370),
 ('falls', 370),
 ('non', 369),
 ('battle', 369),
 ('figure', 369),
 ('scary', 369),
 ('using', 368),
 ('return', 368),
 ('add', 367),
 ('doubt', 367),
 ('solid', 366),
 ('success', 366),
 ('hear', 366),
 ('touching', 365),
 ('oh', 365),
 ('jokes', 365),
 ('political', 365),
 ('boys', 364),
 ('awesome', 364),
 ('hell', 364),
 ('sexual', 362),
 ('dog', 362),
 ('recently', 362),
 ('straight', 361),
 ('features', 361),
 ('please', 361),
 ('wouldn', 361),
 ('forget', 360),
 ('setting', 360),
 ('lack', 360),
 ('mark', 359),
 ('married', 359),
 ('social', 357),
 ('adventure', 356),
 ('interested', 356),
 ('terrific', 355),
 ('brothers', 355),
 ('sees', 355),
 ('actual', 355),
 ('move', 354),
 ('call', 354),
 ('dr', 353),
 ('various', 353),
 ('theater', 353),
 ('animated', 352),
 ('western', 351),
 ('baby', 350),
 ('space', 350),
 ('disappointed', 348),
 ('leading', 348),
 ('portrayed', 346),
 ('aren', 346),
 ('screenplay', 345),
 ('smith', 345),
 ('towards', 344),
 ('hate', 344),
 ('noir', 343),
 ('kelly', 342),
 ('decent', 342),
 ('outstanding', 342),
 ('journey', 341),
 ('directors', 341),
 ('effective', 340),
 ('looked', 340),
 ('none', 340),
 ('storyline', 339),
 ('caught', 339),
 ('sci', 339),
 ('fi', 339),
 ('cold', 339),
 ('mary', 339),
 ('charming', 338),
 ('rich', 338),
 ('manages', 337),
 ('popular', 337),
 ('harry', 337),
 ('rare', 337),
 ('spirit', 336),
 ('appreciate', 335),
 ('open', 335),
 ('moves', 334),
 ('basically', 334),
 ('acted', 334),
 ('pace', 333),
 ('mention', 333),
 ('boring', 333),
 ('subtle', 333),
 ('deserves', 333),
 ('inside', 333),
 ('century', 333),
 ('background', 332),
 ('familiar', 332),
 ('ben', 331),
 ('supposed', 330),
 ('creepy', 330),
 ('secret', 329),
 ('die', 328),
 ('jim', 328),
 ('effect', 327),
 ('question', 327),
 ('natural', 327),
 ('language', 326),
 ('impressive', 326),
 ('rate', 326),
 ('saying', 325),
 ('intelligent', 325),
 ('material', 324),
 ('scott', 324),
 ('realize', 324),
 ('telling', 324),
 ('singing', 323),
 ('dancing', 322),
 ('imagine', 321),
 ('visual', 321),
 ('adult', 321),
 ('office', 320),
 ('kept', 320),
 ('uses', 319),
 ('stunning', 318),
 ('wait', 318),
 ('pure', 318),
 ('review', 317),
 ('copy', 317),
 ('previous', 317),
 ('seriously', 317),
 ('created', 316),
 ('magic', 316),
 ('reading', 316),
 ('create', 316),
 ('somehow', 316),
 ('hot', 316),
 ('frank', 315),
 ('attempt', 315),
 ('crazy', 315),
 ('escape', 315),
 ('stay', 315),
 ('air', 315),
 ('hands', 314),
 ('filled', 313),
 ('average', 312),
 ('surprisingly', 312),
 ('expected', 312),
 ('complex', 311),
 ('successful', 310),
 ('quickly', 310),
 ('studio', 310),
 ('male', 309),
 ('plus', 309),
 ('co', 307),
 ('casting', 306),
 ('following', 306),
 ('images', 306),
 ('minute', 306),
 ('exciting', 306),
 ('follows', 305),
 ('reasons', 305),
 ('members', 305),
 ('german', 305),
 ('e', 305),
 ('themes', 305),
 ('cute', 304),
 ('touch', 304),
 ('free', 304),
 ('genius', 304),
 ('edge', 304),
 ('outside', 303),
 ('reviews', 302),
 ('ok', 302),
 ('younger', 302),
 ('admit', 302),
 ('odd', 301),
 ('fighting', 301),
 ('master', 301),
 ('recent', 300),
 ('comment', 300),
 ('thanks', 300),
 ('break', 300),
 ('apart', 299),
 ('lovely', 298),
 ('begin', 298),
 ('emotions', 298),
 ('party', 297),
 ('italian', 297),
 ('doctor', 297),
 ('sequel', 296),
 ('la', 296),
 ...]
In [11]:
# Examine the counts of the most common words in negative reviews.
# Called without an argument, most_common() returns every (word, count) pair,
# ordered from the most frequent word to the least frequent.
negative_counts.most_common()
Out[11]:
[('', 561462),
 ('.', 167538),
 ('the', 163389),
 ('a', 79321),
 ('and', 74385),
 ('of', 69009),
 ('to', 68974),
 ('br', 52637),
 ('is', 50083),
 ('it', 48327),
 ('i', 46880),
 ('in', 43753),
 ('this', 40920),
 ('that', 37615),
 ('s', 31546),
 ('was', 26291),
 ('movie', 24965),
 ('for', 21927),
 ('but', 21781),
 ('with', 20878),
 ('as', 20625),
 ('t', 20361),
 ('film', 19218),
 ('you', 17549),
 ('on', 17192),
 ('not', 16354),
 ('have', 15144),
 ('are', 14623),
 ('be', 14541),
 ('he', 13856),
 ('one', 13134),
 ('they', 13011),
 ('at', 12279),
 ('his', 12147),
 ('all', 12036),
 ('so', 11463),
 ('like', 11238),
 ('there', 10775),
 ('just', 10619),
 ('by', 10549),
 ('or', 10272),
 ('an', 10266),
 ('who', 9969),
 ('from', 9731),
 ('if', 9518),
 ('about', 9061),
 ('out', 8979),
 ('what', 8422),
 ('some', 8306),
 ('no', 8143),
 ('her', 7947),
 ('even', 7687),
 ('can', 7653),
 ('has', 7604),
 ('good', 7423),
 ('bad', 7401),
 ('would', 7036),
 ('up', 6970),
 ('only', 6781),
 ('more', 6730),
 ('when', 6726),
 ('she', 6444),
 ('really', 6262),
 ('time', 6209),
 ('had', 6142),
 ('my', 6015),
 ('were', 6001),
 ('which', 5780),
 ('very', 5764),
 ('me', 5606),
 ('see', 5452),
 ('don', 5336),
 ('we', 5328),
 ('their', 5278),
 ('do', 5236),
 ('story', 5208),
 ('than', 5183),
 ('been', 5100),
 ('much', 5078),
 ('get', 5037),
 ('because', 4966),
 ('people', 4806),
 ('then', 4761),
 ('make', 4722),
 ('how', 4688),
 ('could', 4686),
 ('any', 4658),
 ('into', 4567),
 ('made', 4541),
 ('first', 4306),
 ('other', 4305),
 ('well', 4254),
 ('too', 4174),
 ('them', 4165),
 ('plot', 4154),
 ('movies', 4080),
 ('acting', 4056),
 ('will', 3993),
 ('way', 3989),
 ('most', 3919),
 ('him', 3858),
 ('after', 3838),
 ('its', 3655),
 ('think', 3643),
 ('also', 3608),
 ('characters', 3600),
 ('off', 3567),
 ('watch', 3550),
 ('did', 3506),
 ('character', 3506),
 ('why', 3463),
 ('being', 3393),
 ('better', 3358),
 ('know', 3334),
 ('over', 3316),
 ('seen', 3265),
 ('ever', 3263),
 ('never', 3259),
 ('your', 3233),
 ('where', 3219),
 ('two', 3173),
 ('little', 3096),
 ('films', 3077),
 ('here', 3027),
 ('m', 3000),
 ('nothing', 2990),
 ('say', 2982),
 ('end', 2954),
 ('something', 2942),
 ('should', 2920),
 ('many', 2909),
 ('does', 2871),
 ('thing', 2866),
 ('show', 2862),
 ('ve', 2829),
 ('scene', 2816),
 ('scenes', 2785),
 ('these', 2724),
 ('go', 2717),
 ('didn', 2646),
 ('watching', 2640),
 ('great', 2640),
 ('re', 2620),
 ('doesn', 2601),
 ('through', 2560),
 ('such', 2544),
 ('man', 2516),
 ('worst', 2480),
 ('actually', 2449),
 ('actors', 2437),
 ('life', 2429),
 ('back', 2424),
 ('while', 2418),
 ('director', 2405),
 ('funny', 2336),
 ('going', 2319),
 ('still', 2283),
 ('another', 2254),
 ('look', 2247),
 ('now', 2237),
 ('old', 2215),
 ('those', 2212),
 ('real', 2170),
 ('few', 2158),
 ('love', 2152),
 ('horror', 2150),
 ('before', 2147),
 ('want', 2141),
 ('minutes', 2126),
 ('pretty', 2115),
 ('best', 2094),
 ('though', 2091),
 ('same', 2081),
 ('script', 2074),
 ('work', 2027),
 ('every', 2025),
 ('seems', 2023),
 ('least', 2011),
 ('enough', 1997),
 ('down', 1988),
 ('original', 1983),
 ('guy', 1964),
 ('got', 1952),
 ('around', 1943),
 ('part', 1942),
 ('lot', 1892),
 ('anything', 1874),
 ('find', 1860),
 ('new', 1854),
 ('again', 1849),
 ('isn', 1849),
 ('point', 1845),
 ('things', 1839),
 ('fact', 1839),
 ('give', 1823),
 ('makes', 1814),
 ('take', 1800),
 ('thought', 1798),
 ('d', 1770),
 ('whole', 1768),
 ('long', 1761),
 ('years', 1759),
 ('however', 1740),
 ('gets', 1714),
 ('making', 1695),
 ('cast', 1694),
 ('big', 1662),
 ('might', 1658),
 ('interesting', 1648),
 ('money', 1638),
 ('us', 1628),
 ('right', 1625),
 ('far', 1619),
 ('quite', 1596),
 ('without', 1595),
 ('come', 1595),
 ('almost', 1574),
 ('ll', 1567),
 ('action', 1566),
 ('awful', 1557),
 ('kind', 1539),
 ('reason', 1534),
 ('am', 1530),
 ('looks', 1528),
 ('must', 1522),
 ('done', 1510),
 ('comedy', 1504),
 ('someone', 1490),
 ('trying', 1486),
 ('wasn', 1484),
 ('poor', 1481),
 ('boring', 1478),
 ('instead', 1478),
 ('saw', 1475),
 ('away', 1469),
 ('girl', 1463),
 ('probably', 1444),
 ('believe', 1434),
 ('sure', 1433),
 ('looking', 1430),
 ('stupid', 1428),
 ('anyone', 1418),
 ('times', 1406),
 ('world', 1404),
 ('maybe', 1404),
 ('rather', 1394),
 ('terrible', 1391),
 ('last', 1390),
 ('may', 1390),
 ('since', 1388),
 ('let', 1385),
 ('tv', 1382),
 ('hard', 1374),
 ('between', 1374),
 ('waste', 1358),
 ('woman', 1356),
 ('feel', 1354),
 ('effects', 1348),
 ('half', 1341),
 ('own', 1333),
 ('young', 1317),
 ('music', 1316),
 ('idea', 1312),
 ('sense', 1306),
 ('bit', 1298),
 ('having', 1280),
 ('book', 1278),
 ('found', 1267),
 ('put', 1263),
 ('series', 1263),
 ('goes', 1256),
 ('worse', 1249),
 ('said', 1230),
 ('comes', 1224),
 ('role', 1222),
 ('main', 1220),
 ('else', 1199),
 ('everything', 1197),
 ('yet', 1196),
 ('low', 1189),
 ('screen', 1188),
 ('supposed', 1186),
 ('actor', 1185),
 ('either', 1183),
 ('budget', 1179),
 ('ending', 1179),
 ('audience', 1178),
 ('set', 1177),
 ('family', 1170),
 ('left', 1169),
 ('completely', 1168),
 ('both', 1158),
 ('wrong', 1155),
 ('always', 1151),
 ('course', 1148),
 ('place', 1148),
 ('seem', 1147),
 ('watched', 1142),
 ('day', 1132),
 ('simply', 1130),
 ('shot', 1126),
 ('mean', 1117),
 ('special', 1102),
 ('dead', 1101),
 ('three', 1094),
 ('house', 1085),
 ('oh', 1084),
 ('night', 1083),
 ('read', 1082),
 ('less', 1067),
 ('high', 1066),
 ('year', 1064),
 ('camera', 1061),
 ('worth', 1057),
 ('our', 1056),
 ('try', 1051),
 ('horrible', 1046),
 ('sex', 1046),
 ('video', 1043),
 ('black', 1039),
 ('although', 1036),
 ('couldn', 1036),
 ('once', 1033),
 ('rest', 1022),
 ('dvd', 1021),
 ('line', 1018),
 ('played', 1017),
 ('fun', 1007),
 ('during', 1006),
 ('production', 1003),
 ('everyone', 1002),
 ('play', 993),
 ('mind', 990),
 ('kids', 989),
 ('version', 989),
 ('seeing', 988),
 ('american', 980),
 ('given', 978),
 ('used', 969),
 ('performance', 968),
 ('especially', 963),
 ('together', 963),
 ('tell', 959),
 ('women', 958),
 ('start', 956),
 ('need', 955),
 ('second', 953),
 ('takes', 950),
 ('each', 950),
 ('wife', 944),
 ('dialogue', 942),
 ('use', 940),
 ('problem', 938),
 ('star', 934),
 ('unfortunately', 931),
 ('himself', 929),
 ('doing', 926),
 ('death', 922),
 ('name', 921),
 ('lines', 919),
 ('killer', 914),
 ('getting', 913),
 ('help', 905),
 ('fan', 902),
 ('couple', 902),
 ('head', 898),
 ('crap', 895),
 ('guess', 888),
 ('piece', 884),
 ('nice', 880),
 ('different', 878),
 ('school', 876),
 ('later', 875),
 ('entire', 869),
 ('shows', 860),
 ('next', 858),
 ('john', 858),
 ('seemed', 857),
 ('short', 857),
 ('hollywood', 850),
 ('home', 848),
 ('person', 846),
 ('true', 846),
 ('absolutely', 842),
 ('sort', 840),
 ('care', 839),
 ('understand', 836),
 ('plays', 835),
 ('felt', 834),
 ('written', 829),
 ('title', 828),
 ('men', 822),
 ('until', 821),
 ('flick', 816),
 ('decent', 815),
 ('face', 814),
 ('friends', 810),
 ('case', 807),
 ('stars', 807),
 ('job', 807),
 ('itself', 804),
 ('yes', 801),
 ('perhaps', 800),
 ('went', 797),
 ('wanted', 797),
 ('called', 796),
 ('annoying', 795),
 ('tries', 790),
 ('ridiculous', 790),
 ('laugh', 788),
 ('evil', 787),
 ('along', 786),
 ('top', 785),
 ('hour', 784),
 ('full', 783),
 ('came', 780),
 ('writing', 780),
 ('keep', 770),
 ('totally', 767),
 ('playing', 766),
 ('god', 765),
 ('won', 764),
 ('guys', 763),
 ('already', 762),
 ('gore', 757),
 ('direction', 748),
 ('save', 746),
 ('lost', 745),
 ('example', 744),
 ('sound', 742),
 ('war', 741),
 ('attempt', 735),
 ('except', 733),
 ('car', 733),
 ('moments', 732),
 ('blood', 732),
 ('obviously', 730),
 ('act', 729),
 ('remember', 728),
 ('kill', 727),
 ('truly', 726),
 ('father', 726),
 ('white', 726),
 ('b', 725),
 ('thinking', 720),
 ('ok', 716),
 ('finally', 716),
 ('turn', 711),
 ('quality', 701),
 ('lack', 698),
 ('style', 694),
 ('wouldn', 693),
 ('cheap', 691),
 ('none', 690),
 ('kid', 686),
 ('please', 686),
 ('boy', 685),
 ('seriously', 684),
 ('lead', 680),
 ('dull', 677),
 ('children', 676),
 ('starts', 675),
 ('stuff', 673),
 ('hope', 672),
 ('looked', 670),
 ('recommend', 669),
 ('under', 668),
 ('killed', 667),
 ('run', 667),
 ('others', 666),
 ('enjoy', 666),
 ('etc', 663),
 ('myself', 663),
 ('beginning', 662),
 ('against', 662),
 ('girls', 662),
 ('obvious', 660),
 ('small', 660),
 ('hell', 659),
 ('slow', 657),
 ('hand', 656),
 ('lame', 652),
 ('wonder', 652),
 ('picture', 651),
 ('becomes', 651),
 ('based', 650),
 ('early', 648),
 ('behind', 646),
 ('poorly', 644),
 ('avoid', 642),
 ('complete', 640),
 ('apparently', 640),
 ('happens', 639),
 ('anyway', 638),
 ('classic', 637),
 ('several', 636),
 ('episode', 635),
 ('certainly', 635),
 ('despite', 635),
 ('often', 631),
 ('writer', 630),
 ('cut', 630),
 ('predictable', 628),
 ('gave', 628),
 ('mother', 628),
 ('become', 627),
 ('close', 625),
 ('fans', 624),
 ('saying', 621),
 ('scary', 619),
 ('live', 618),
 ('stop', 618),
 ('wants', 617),
 ('self', 615),
 ('mr', 612),
 ('friend', 611),
 ('jokes', 611),
 ('cannot', 610),
 ('overall', 609),
 ('cinema', 604),
 ('child', 603),
 ('silly', 601),
 ('beautiful', 596),
 ('human', 595),
 ('expect', 594),
 ('liked', 593),
 ('happened', 592),
 ('bunch', 590),
 ('entertaining', 590),
 ('final', 588),
 ('actress', 588),
 ('says', 584),
 ('performances', 584),
 ('turns', 577),
 ('humor', 577),
 ('eyes', 576),
 ('themselves', 576),
 ('hours', 574),
 ('happen', 573),
 ('basically', 572),
 ('days', 572),
 ('running', 571),
 ('call', 569),
 ('disappointed', 569),
 ('involved', 569),
 ('directed', 568),
 ('group', 568),
 ('fight', 567),
 ('talking', 566),
 ('daughter', 566),
 ('body', 566),
 ('sorry', 565),
 ('badly', 565),
 ('throughout', 563),
 ('viewer', 563),
 ('yourself', 562),
 ('extremely', 562),
 ('interest', 561),
 ('heard', 561),
 ('violence', 561),
 ('shots', 559),
 ('side', 557),
 ('word', 556),
 ('art', 555),
 ('possible', 554),
 ('dark', 551),
 ('game', 551),
 ('hero', 550),
 ('alone', 549),
 ('son', 547),
 ('type', 547),
 ('leave', 547),
 ('gives', 546),
 ('single', 546),
 ('parts', 546),
 ('started', 545),
 ('female', 543),
 ('rating', 541),
 ('mess', 541),
 ('voice', 541),
 ('town', 540),
 ('aren', 540),
 ('drama', 538),
 ('definitely', 537),
 ('unless', 536),
 ('review', 534),
 ('effort', 533),
 ('weak', 533),
 ('able', 533),
 ('took', 531),
 ('non', 530),
 ('five', 530),
 ('matter', 529),
 ('usually', 529),
 ('michael', 528),
 ('feeling', 526),
 ('huge', 523),
 ('sequel', 522),
 ('soon', 521),
 ('exactly', 520),
 ('past', 519),
 ('police', 518),
 ('turned', 518),
 ('tried', 515),
 ('middle', 513),
 ('talent', 513),
 ('genre', 512),
 ('zombie', 510),
 ('history', 509),
 ('ends', 509),
 ('straight', 503),
 ('opening', 501),
 ('serious', 501),
 ('coming', 501),
 ('moment', 500),
 ('lives', 499),
 ('sad', 499),
 ('dialog', 498),
 ('particularly', 498),
 ('editing', 493),
 ('clearly', 492),
 ('earth', 491),
 ('beyond', 491),
 ('taken', 490),
 ('cool', 490),
 ('level', 489),
 ('dumb', 489),
 ('okay', 488),
 ('major', 487),
 ('premise', 485),
 ('fast', 485),
 ('stories', 484),
 ('joke', 484),
 ('wasted', 483),
 ('minute', 483),
 ('rent', 482),
 ('across', 482),
 ('mostly', 482),
 ('fails', 481),
 ('falls', 481),
 ('late', 481),
 ('mention', 478),
 ('theater', 475),
 ('stay', 472),
 ('sometimes', 472),
 ('hit', 468),
 ('talk', 467),
 ('fine', 467),
 ('die', 466),
 ('storyline', 465),
 ('pointless', 465),
 ('taking', 464),
 ('order', 462),
 ('brother', 461),
 ('told', 460),
 ('whatever', 460),
 ('wish', 458),
 ('room', 456),
 ('write', 455),
 ('appears', 455),
 ('career', 455),
 ('known', 454),
 ('husband', 454),
 ('living', 451),
 ('ten', 450),
 ('sit', 450),
 ('words', 449),
 ('monster', 448),
 ('chance', 448),
 ('novel', 444),
 ('hate', 444),
 ('add', 443),
 ('english', 443),
 ('somehow', 441),
 ('strange', 440),
 ('imdb', 438),
 ('actual', 438),
 ('material', 437),
 ('ones', 437),
 ('killing', 437),
 ('total', 437),
 ('knew', 436),
 ('king', 434),
 ('number', 434),
 ('using', 433),
 ('power', 431),
 ('shown', 431),
 ('giving', 431),
 ('lee', 431),
 ('works', 431),
 ('points', 430),
 ('possibly', 430),
 ('kept', 430),
 ('four', 429),
 ('local', 427),
 ('usual', 426),
 ('including', 425),
 ('ago', 424),
 ('problems', 424),
 ('opinion', 424),
 ('nudity', 423),
 ('age', 422),
 ('due', 421),
 ('roles', 420),
 ('writers', 419),
 ('decided', 419),
 ('flat', 418),
 ('easily', 418),
 ('near', 418),
 ('murder', 417),
 ('experience', 417),
 ('reviews', 416),
 ('imagine', 415),
 ('feels', 413),
 ('plain', 411),
 ('somewhat', 411),
 ('class', 410),
 ('score', 410),
 ('song', 409),
 ('bring', 409),
 ('whether', 409),
 ('whose', 408),
 ('average', 408),
 ('otherwise', 408),
 ('pathetic', 407),
 ('zombies', 407),
 ('nearly', 407),
 ('knows', 407),
 ('cheesy', 406),
 ('upon', 406),
 ('cinematography', 406),
 ('city', 405),
 ('space', 405),
 ('credits', 404),
 ('james', 403),
 ('lots', 403),
 ('change', 403),
 ('entertainment', 402),
 ('nor', 402),
 ('wait', 401),
 ('released', 400),
 ('needs', 399),
 ('shame', 398),
 ('attention', 396),
 ('comments', 394),
 ('lady', 393),
 ('free', 393),
 ('bored', 393),
 ('clear', 392),
 ('needed', 392),
 ('expected', 392),
 ('view', 391),
 ('development', 390),
 ('doubt', 390),
 ('check', 390),
 ('mystery', 389),
 ('figure', 389),
 ('garbage', 388),
 ('excellent', 388),
 ('sequence', 386),
 ('television', 386),
 ('o', 385),
 ('sets', 385),
 ('laughable', 384),
 ('potential', 384),
 ('country', 382),
 ('light', 382),
 ('robert', 382),
 ('reality', 382),
 ('documentary', 382),
 ('general', 381),
 ('ask', 381),
 ('fall', 380),
 ('begin', 380),
 ('comic', 380),
 ('stand', 379),
 ('remake', 379),
 ('trash', 379),
 ('forced', 379),
 ('footage', 379),
 ('thriller', 378),
 ('songs', 378),
 ('gay', 377),
 ('within', 377),
 ('hardly', 376),
 ('gone', 375),
 ('above', 375),
 ('george', 374),
 ('means', 373),
 ('sounds', 373),
 ('david', 372),
 ('move', 372),
 ('buy', 372),
 ('directing', 372),
 ('rock', 371),
 ('forward', 371),
 ('important', 371),
 ('british', 370),
 ('haven', 370),
 ('hot', 370),
 ('filmed', 370),
 ('reading', 369),
 ('fake', 369),
 ('heart', 369),
 ('incredibly', 368),
 ('weird', 368),
 ('hear', 368),
 ('cop', 367),
 ('enjoyed', 367),
 ('hilarious', 367),
 ('musical', 367),
 ('happy', 366),
 ('message', 366),
 ('pay', 366),
 ('box', 365),
 ('laughs', 365),
 ('sadly', 363),
 ('suspense', 363),
 ('eye', 362),
 ('similar', 361),
 ('third', 361),
 ('named', 361),
 ('modern', 360),
 ('events', 359),
 ('failed', 359),
 ('forget', 358),
 ('question', 358),
 ('male', 357),
 ('finds', 357),
 ('perfect', 356),
 ('sister', 355),
 ('spent', 355),
 ('feature', 354),
 ('result', 354),
 ('comment', 353),
 ('girlfriend', 353),
 ('sexual', 352),
 ('attempts', 351),
 ('richard', 351),
 ('neither', 351),
 ('screenplay', 350),
 ('elements', 350),
 ('spoilers', 349),
 ('filmmakers', 348),
 ('showing', 348),
 ('brain', 348),
 ('dr', 347),
 ('miss', 347),
 ('christmas', 347),
 ('cover', 345),
 ('sequences', 344),
 ('red', 344),
 ('excuse', 343),
 ('typical', 343),
 ('baby', 342),
 ('crazy', 342),
 ('ideas', 342),
 ('meant', 341),
 ('loved', 341),
 ('fire', 340),
 ('worked', 340),
 ('unbelievable', 339),
 ('follow', 339),
 ('theme', 337),
 ('producers', 336),
 ('twist', 336),
 ('barely', 336),
 ('appear', 336),
 ('plus', 336),
 ('team', 335),
 ('directors', 335),
 ('viewers', 333),
 ('leads', 332),
 ('tom', 332),
 ('slasher', 332),
 ('working', 331),
 ('villain', 331),
 ('wrote', 331),
 ('gun', 331),
 ('strong', 330),
 ('realize', 330),
 ('island', 330),
 ('open', 330),
 ('positive', 329),
 ('yeah', 329),
 ('quickly', 329),
 ('disappointing', 329),
 ('release', 328),
 ('simple', 328),
 ('weren', 328),
 ('honestly', 328),
 ('eventually', 327),
 ('period', 327),
 ('doctor', 327),
 ('kills', 327),
 ('tells', 327),
 ('herself', 326),
 ('dog', 326),
 ('list', 326),
 ('nowhere', 326),
 ('acted', 326),
 ('walk', 325),
 ('apart', 324),
 ('air', 324),
 ('subject', 323),
 ('makers', 323),
 ('fi', 322),
 ('learn', 322),
 ('sci', 319),
 ('admit', 319),
 ('bother', 319),
 ('hands', 318),
 ('disappointment', 318),
 ('note', 318),
 ('jack', 318),
 ('certain', 317),
 ('value', 317),
 ('casting', 317),
 ('e', 317),
 ('peter', 316),
 ('grade', 316),
 ('missing', 315),
 ('suddenly', 315),
 ('stick', 313),
 ('form', 313),
 ('previous', 313),
 ('break', 313),
 ('soundtrack', 312),
 ('surprised', 311),
 ('expecting', 311),
 ('front', 311),
 ('relationship', 310),
 ('parents', 310),
 ('surprise', 310),
 ('shoot', 309),
 ('today', 309),
 ('ended', 308),
 ('ways', 308),
 ('vampire', 308),
 ('leaves', 308),
 ('somewhere', 308),
 ('concept', 308),
 ('creepy', 308),
 ('painful', 308),
 ('spend', 307),
 ('th', 307),
 ('effect', 306),
 ('difficult', 306),
 ('fighting', 306),
 ('street', 306),
 ('future', 306),
 ('america', 305),
 ('c', 305),
 ('accent', 304),
 ('project', 302),
 ('truth', 302),
 ('f', 301),
 ('deal', 301),
 ('indeed', 301),
 ('joe', 301),
 ('biggest', 300),
 ('rate', 300),
 ('japanese', 299),
 ('paul', 299),
 ('utterly', 298),
 ('redeeming', 298),
 ('college', 298),
 ('begins', 298),
 ('disney', 297),
 ('fairly', 297),
 ('york', 297),
 ('create', 296),
 ('crew', 296),
 ('revenge', 296),
 ('cartoon', 296),
 ('outside', 295),
 ('computer', 295),
 ('co', 295),
 ('interested', 295),
 ('stage', 295),
 ('among', 294),
 ('considering', 294),
 ('speak', 294),
 ('sick', 293),
 ('towards', 293),
 ('channel', 293),
 ('cause', 292),
 ('particular', 292),
 ('hair', 292),
 ('van', 292),
 ('talented', 292),
 ('bottom', 291),
 ('reasons', 291),
 ('mediocre', 290),
 ('cat', 290),
 ('telling', 290),
 ('store', 289),
 ('supporting', 289),
 ('hoping', 288),
 ('waiting', 288),
 ...]

As you can see, common words like "the" appear very often in both positive and negative reviews. Instead of finding the most common words in positive or negative reviews, what you really want are the words found in positive reviews more often than in negative reviews, and vice versa. To accomplish this, you'll need to calculate the ratios of word usage between positive and negative reviews.

In [12]:
# Score each common word by how much more often it is counted in positive
# reviews than in negative ones.
pos_neg_ratios = Counter()

# A word is scored only when its total count is strictly greater than 100.
# Words are visited from most to least frequent, which also fixes the order in
# which equal ratios are later listed by most_common().
for term, cnt in total_counts.most_common():
    if cnt <= 100:
        continue
    # The +1 keeps the denominator non-zero when a word has no negative count.
    pos_neg_ratio = positive_counts[term] / float(negative_counts[term] + 1)
    pos_neg_ratios[term] = pos_neg_ratio

Examine the ratios you've calculated for a few words:

In [13]:
# Spot-check three words: build the report lines first, then emit them in a
# single print call (one line per word, same text as printing each separately).
ratio_report = [
    "Pos-to-neg ratio for 'the' = {}".format(pos_neg_ratios["the"]),
    "Pos-to-neg ratio for 'amazing' = {}".format(pos_neg_ratios["amazing"]),
    "Pos-to-neg ratio for 'terrible' = {}".format(pos_neg_ratios["terrible"]),
]
print("\n".join(ratio_report))
Pos-to-neg ratio for 'the' = 1.0607993145235326
Pos-to-neg ratio for 'amazing' = 4.022813688212928
Pos-to-neg ratio for 'terrible' = 0.17744252873563218
In [14]:
# Put the ratios on a log scale: a ratio of 1 becomes 0, ratios above 1 become
# positive and ratios below 1 become negative.
# NOTE: pos_neg_ratios is overwritten in place, so running this cell a second
# time would take the log of values that are already logs.
# NOTE: a ratio of exactly 0 maps to -inf under np.log.
for word, ratio in pos_neg_ratios.most_common():
    # most_common() returned a separate list, so reassigning entries here does
    # not disturb the iteration.
    log_ratio = np.log(ratio)
    pos_neg_ratios[word] = log_ratio

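Examine the new, log-scaled ratios for the same words as before (the printed label still reads "Pos-to-neg ratio", but the values are now natural logarithms of the ratios):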

In [15]:
# Re-inspect the same three words; the stored values are now logarithms of the
# ratios. The lines are collected first and written with a single print call.
log_ratio_report = [
    "Pos-to-neg ratio for 'the' = {}".format(pos_neg_ratios["the"]),
    "Pos-to-neg ratio for 'amazing' = {}".format(pos_neg_ratios["amazing"]),
    "Pos-to-neg ratio for 'terrible' = {}".format(pos_neg_ratios["terrible"]),
]
print("\n".join(log_ratio_report))
Pos-to-neg ratio for 'the' = 0.05902269426102881
Pos-to-neg ratio for 'amazing' = 1.3919815802404802
Pos-to-neg ratio for 'terrible' = -1.7291085042663878
In [16]:
# Rank every scored word from the highest log ratio to the lowest; the head of
# the list holds the words that lean most strongly towards "POSITIVE" reviews.
words_by_log_ratio = pos_neg_ratios.most_common()
words_by_log_ratio
Out[16]:
[('edie', 4.6913478822291435),
 ('paulie', 4.07753744390572),
 ('felix', 3.152736022363656),
 ('polanski', 2.8233610476132043),
 ('matthau', 2.80672172860924),
 ('victoria', 2.681021528714291),
 ('mildred', 2.6026896854443837),
 ('gandhi', 2.538973871058276),
 ('flawless', 2.451005098112319),
 ('superbly', 2.26002547857525),
 ('perfection', 2.159484249353372),
 ('astaire', 2.1400661634962708),
 ('captures', 2.038619547159581),
 ('voight', 2.030170492673053),
 ('wonderfully', 2.0218960560332353),
 ('powell', 1.978345424808467),
 ('brosnan', 1.9547990964725592),
 ('lily', 1.9203768470501485),
 ('bakshi', 1.9029851043382795),
 ('lincoln', 1.9014583864844796),
 ('refreshing', 1.8551812956655511),
 ('breathtaking', 1.8481124057791867),
 ('bourne', 1.8478489358790986),
 ('lemmon', 1.8458266904983307),
 ('delightful', 1.8002701588959635),
 ('flynn', 1.7996646487351682),
 ('andrews', 1.7764919970972666),
 ('homer', 1.7692866133759964),
 ('beautifully', 1.7626953362841438),
 ('soccer', 1.7578579175523736),
 ('elvira', 1.739703107272002),
 ('underrated', 1.7197859696029656),
 ('gripping', 1.7165360479904674),
 ('superb', 1.7091514458966952),
 ('delight', 1.6714733033535532),
 ('welles', 1.667706820558076),
 ('sadness', 1.663505133704376),
 ('sinatra', 1.6389967146756448),
 ('touching', 1.637217476541176),
 ('timeless', 1.62924053973028),
 ('macy', 1.6211339521972916),
 ('unforgettable', 1.6177367152487956),
 ('favorites', 1.6158688027643908),
 ('stewart', 1.611998733295774),
 ('hartley', 1.6094379124341003),
 ('sullivan', 1.6094379124341003),
 ('extraordinary', 1.6094379124341003),
 ('brilliantly', 1.5950491749820008),
 ('friendship', 1.5677652160335325),
 ('wonderful', 1.5645425925262093),
 ('palma', 1.5553706911638245),
 ('magnificent', 1.54663701119507),
 ('finest', 1.546259010812569),
 ('jackie', 1.5439233053234738),
 ('ritter', 1.540445040947149),
 ('tremendous', 1.5184661342283736),
 ('freedom', 1.5091151908062312),
 ('fantastic', 1.5048433868558566),
 ('terrific', 1.5026699370083942),
 ('sidney', 1.493925025312256),
 ('noir', 1.493925025312256),
 ('outstanding', 1.4910053152089213),
 ('pleasantly', 1.4894785973551214),
 ('mann', 1.4894785973551214),
 ('nancy', 1.488077055429833),
 ('marie', 1.4825711915553104),
 ('marvelous', 1.4739999415389962),
 ('excellent', 1.4647538505723599),
 ('ruth', 1.4596256342054401),
 ('stanwyck', 1.4412101187160054),
 ('widmark', 1.4350845252893227),
 ('splendid', 1.4271163556401458),
 ('chan', 1.423108334242607),
 ('exceptional', 1.4201959127955721),
 ('tender', 1.410986973710262),
 ('gentle', 1.4078005663408544),
 ('poignant', 1.4022947024663317),
 ('gem', 1.3932148039644643),
 ('amazing', 1.3919815802404802),
 ('chilling', 1.3862943611198906),
 ('captivating', 1.3862943611198906),
 ('davies', 1.3862943611198906),
 ('fisher', 1.3862943611198906),
 ('darker', 1.3652409519220583),
 ('april', 1.349926716949016),
 ('kelly', 1.3461743673304654),
 ('blake', 1.3418425985490567),
 ('overlooked', 1.329135947279942),
 ('ralph', 1.32818673031261),
 ('bette', 1.3156767939059373),
 ('hoffman', 1.315066851831523),
 ('cole', 1.3121863889661687),
 ('shines', 1.3049487216659381),
 ('powerful', 1.2999662776313934),
 ('notch', 1.2950456896547455),
 ('remarkable', 1.2883688239495823),
 ('pitt', 1.286210902562908),
 ('winters', 1.2833463918674481),
 ('vivid', 1.2762934659055623),
 ('gritty', 1.2757524867200667),
 ('giallo', 1.274502955131774),
 ('portrait', 1.270462545594769),
 ('innocence', 1.2694300209805796),
 ('psychiatrist', 1.2685113254635072),
 ('favorite', 1.2668956297860055),
 ('ensemble', 1.2656663733312759),
 ('stunning', 1.2622417124499117),
 ('burns', 1.259880436264232),
 ('garbo', 1.258954938743289),
 ('barbara', 1.2580400255962119),
 ('philip', 1.252762968495368),
 ('panic', 1.252762968495368),
 ('holly', 1.252762968495368),
 ('carol', 1.2481440226390734),
 ('perfect', 1.246742480713785),
 ('appreciated', 1.2462482874741743),
 ('favourite', 1.2411123512753928),
 ('journey', 1.236762627148927),
 ('rural', 1.235471471385307),
 ('bond', 1.2321436812926323),
 ('builds', 1.2305398317106577),
 ('brilliant', 1.2287554137664785),
 ('brooklyn', 1.2286654169163074),
 ('von', 1.225175011976539),
 ('unfolds', 1.2163953243244932),
 ('recommended', 1.2163953243244932),
 ('daniel', 1.20215296760895),
 ('perfectly', 1.1971931173405572),
 ('crafted', 1.1962507582320256),
 ('prince', 1.1939224684724346),
 ('troubled', 1.192138346678933),
 ('consequences', 1.1865810616140668),
 ('haunting', 1.1814999484738773),
 ('cinderella', 1.180052620608284),
 ('alexander', 1.17599895228353),
 ('emotions', 1.1753049094563641),
 ('boxing', 1.1735135968412274),
 ('subtle', 1.173413501750808),
 ('curtis', 1.1649873576129823),
 ('rare', 1.1566438362402944),
 ('loved', 1.1563661500586044),
 ('daughters', 1.1526795099383853),
 ('courage', 1.1438688802562305),
 ('dentist', 1.1426722784621401),
 ('highly', 1.1420208631618658),
 ('nominated', 1.1409146683587992),
 ('tony', 1.139749194228599),
 ('draws', 1.132513840343791),
 ('everyday', 1.1306150197542835),
 ('contrast', 1.128465251817791),
 ('cried', 1.121340539745666),
 ('fabulous', 1.1210851445201684),
 ('ned', 1.120591195386885),
 ('fay', 1.120591195386885),
 ('emma', 1.1184149159642893),
 ('sensitive', 1.113318436057805),
 ('smooth', 1.1089750757036563),
 ('dramas', 1.1080910326226534),
 ('today', 1.1050431789984),
 ('helps', 1.1023091505494358),
 ('inspiring', 1.0986122886681098),
 ('jimmy', 1.0937696641923216),
 ('awesome', 1.0931328229034842),
 ('unique', 1.0881409888008142),
 ('tragic', 1.0871835928444868),
 ('intense', 1.0870514662670339),
 ('stellar', 1.0857088838322018),
 ('rival', 1.0822184788924332),
 ('provides', 1.079708134028957),
 ('depression', 1.0782034170369026),
 ('shy', 1.0775588794702773),
 ('carrie', 1.076139432816051),
 ('blend', 1.0753554265038423),
 ('hank', 1.0736109864626924),
 ('diana', 1.072636802264849),
 ('adorable', 1.072636802264849),
 ('unexpected', 1.0722255334949147),
 ('achievement', 1.0668635903535293),
 ('bettie', 1.0663514264498881),
 ('happiness', 1.0632729222228008),
 ('glorious', 1.0608719606852626),
 ('davis', 1.0541605260972757),
 ('terrifying', 1.0525211814678428),
 ('beauty', 1.050410186850232),
 ('ideal', 1.0479685558493548),
 ('fears', 1.0467872208035236),
 ('hong', 1.0438040521731147),
 ('seasons', 1.0433496099930604),
 ('fascinating', 1.0414538748281612),
 ('carries', 1.0345904299031787),
 ('satisfying', 1.0321225473992768),
 ('definite', 1.0319209141694374),
 ('touched', 1.0296194171811581),
 ('greatest', 1.0248947127715422),
 ('creates', 1.0241097613701886),
 ('aunt', 1.023388867430522),
 ('walter', 1.022328983918479),
 ('spectacular', 1.0198314108149955),
 ('portrayal', 1.0189810189761024),
 ('ann', 1.0127808528183286),
 ('enterprise', 1.0116009116784799),
 ('musicals', 1.0096648026516135),
 ('deeply', 1.0094845087721023),
 ('incredible', 1.0061677561461084),
 ('mature', 1.0060195018402847),
 ('margaret', 0.9968295943581673),
 ('triumph', 0.9968295943581673),
 ('navy', 0.9949338591932683),
 ('harry', 0.9917691930500606),
 ('lucas', 0.990398704027877),
 ('sweet', 0.9896611048795548),
 ('joey', 0.9879467207805901),
 ('oscar', 0.9872190511104971),
 ('balance', 0.9864949905474035),
 ('warm', 0.9848534033114517),
 ('ages', 0.9844989819006886),
 ('guilt', 0.9808292530117262),
 ('glover', 0.9808292530117262),
 ('carrey', 0.9808292530117262),
 ('learns', 0.978811088855489),
 ('unusual', 0.9778837427819693),
 ('sons', 0.977775815524836),
 ('complex', 0.977618977381478),
 ('essence', 0.9775343571148737),
 ('brazil', 0.9769153536905899),
 ('widow', 0.9765095918672099),
 ('solid', 0.9753796482441615),
 ('beautiful', 0.9732630126284105),
 ('holmes', 0.9724610033412096),
 ('awe', 0.9718605830289658),
 ('vhs', 0.9711673420999893),
 ('eerie', 0.9711673420999893),
 ('lonely', 0.9687372072466975),
 ('grim', 0.9687372072466975),
 ('sport', 0.9682504708048661),
 ('debut', 0.965080896043587),
 ('destiny', 0.963437510299857),
 ('thrillers', 0.9628107475090479),
 ('tears', 0.9597758438138939),
 ('rose', 0.9566420273977225),
 ('feelings', 0.9555114450274363),
 ('ginger', 0.9555114450274363),
 ('winning', 0.9547181090080405),
 ('stanley', 0.953873443023198),
 ('cox', 0.9534302788236119),
 ('paris', 0.9527847903047266),
 ('heart', 0.9523880692451681),
 ('hooked', 0.951558870711613),
 ('comfortable', 0.9480394301887354),
 ('mgm', 0.9444616088408515),
 ('masterpiece', 0.941550398633393),
 ('themes', 0.9411882834958823),
 ('danny', 0.9396711805182187),
 ('anime', 0.9337838893216722),
 ('perry', 0.9332883082427261),
 ('joy', 0.9330175256794686),
 ('lovable', 0.9308188324370649),
 ('mysteries', 0.9295359586241757),
 ('hal', 0.9295359586241757),
 ('louis', 0.9287132518727123),
 ('charming', 0.9252060955321074),
 ('urban', 0.9236708391717776),
 ('allows', 0.9218309122497704),
 ('impact', 0.9181581460489504),
 ('gradually', 0.9162907318741551),
 ('lifestyle', 0.9162907318741551),
 ('italy', 0.9162907318741551),
 ('spy', 0.9128951428730169),
 ('treat', 0.9119334265051994),
 ('subsequent', 0.9105600571651701),
 ('kennedy', 0.9098182173685376),
 ('loving', 0.9096754927554359),
 ('surprising', 0.9093702890295813),
 ('quiet', 0.9064867317775342),
 ('winter', 0.9062403960206536),
 ('reveals', 0.9049054096490298),
 ('raw', 0.9044562742271522),
 ('funniest', 0.9007865453381899),
 ('pleased', 0.8999415938726256),
 ('norman', 0.8999415938726256),
 ('thief', 0.8987464222232455),
 ('season', 0.8982722263714767),
 ('secrets', 0.8979415932059586),
 ('colorful', 0.8970593699462676),
 ('highest', 0.8967461358011849),
 ('compelling', 0.8946292350929758),
 ('danes', 0.8924800831804366),
 ('castle', 0.889677083356065),
 ('kudos', 0.8888917576860407),
 ('great', 0.8881047090146459),
 ('subtitles', 0.8873031950009027),
 ('baseball', 0.8873031950009027),
 ('bleak', 0.8873031950009027),
 ('winner', 0.8864377687244739),
 ('tragedy', 0.8856369907831526),
 ('todd', 0.8855190732074014),
 ('nicely', 0.879249460193806),
 ('arthur', 0.8754687373538999),
 ('essential', 0.8737311174553593),
 ('gorgeous', 0.8731725250935497),
 ('fonda', 0.8729402910005413),
 ('eastwood', 0.871395411966264),
 ('focuses', 0.8708283577973978),
 ('enjoyed', 0.8707019595162461),
 ('natural', 0.8699792450691284),
 ('intensity', 0.868351269585036),
 ('witty', 0.8682410342324468),
 ('rob', 0.8642954367557748),
 ('worlds', 0.8637726975907087),
 ('health', 0.861138911799075),
 ('magical', 0.8595379152817056),
 ('deeper', 0.8580218237501793),
 ('lucy', 0.8561868078044496),
 ('moving', 0.8556661100577203),
 ('lovely', 0.8529064000468131),
 ('purple', 0.8513711857748395),
 ('memorable', 0.8480118911208606),
 ('sings', 0.8472978603872037),
 ('modesty', 0.8434293836092832),
 ('craig', 0.8434293836092832),
 ('relate', 0.8432655968592652),
 ('episodes', 0.8422371208413729),
 ('strong', 0.8416713577706093),
 ('smith', 0.8395981110859005),
 ('tear', 0.8370413602200144),
 ('apartment', 0.8333311529054953),
 ('disagree', 0.8329091229351039),
 ('princess', 0.8329091229351039),
 ('kung', 0.831733343846092),
 ('adventure', 0.8315056139327839),
 ('columbo', 0.8266785731844679),
 ('jake', 0.8266785731844679),
 ('adds', 0.8248565259145232),
 ('hart', 0.8247235383486646),
 ('strength', 0.8241754429663494),
 ('realizes', 0.8236000689573806),
 ('dave', 0.8232003088081431),
 ('childhood', 0.8220808639358386),
 ('forbidden', 0.8198988861990891),
 ('tight', 0.818835395723442),
 ('surreal', 0.8178506590609026),
 ('manager', 0.8177099032017076),
 ('dancer', 0.8157495026522776),
 ('studios', 0.8109302162163288),
 ('con', 0.8109302162163288),
 ('miike', 0.8082165103447326),
 ('realistic', 0.8080771472339223),
 ('explicit', 0.8079226951523736),
 ('kurt', 0.8060875917405409),
 ('traditional', 0.8053591711668733),
 ('deals', 0.8053591711668733),
 ('holds', 0.8049385865480619),
 ('carl', 0.8043728156701697),
 ('touches', 0.8039615469002355),
 ('gene', 0.8031480757742738),
 ('albert', 0.8027669055771679),
 ('abc', 0.8023464725249373),
 ('cry', 0.8001193001121131),
 ('sides', 0.7995275841185171),
 ('eyre', 0.7985076962177716),
 ('develops', 0.7985076962177716),
 ('dances', 0.7969439742415889),
 ('oscars', 0.7963314167951762),
 ('legendary', 0.7960045659996531),
 ('hearted', 0.7949298748698876),
 ('importance', 0.7949298748698876),
 ('portraying', 0.7935659283069927),
 ('impressed', 0.7925810775481322),
 ('waters', 0.7911275889201491),
 ('empire', 0.7907856501238614),
 ('edge', 0.789774016249017),
 ('environment', 0.7884573603642703),
 ('jean', 0.7884573603642703),
 ('sentimental', 0.7864791203521645),
 ('captured', 0.7862376036259573),
 ('styles', 0.7859289140109116),
 ('daring', 0.7859289140109116),
 ('matches', 0.7827593392496325),
 ('tense', 0.7827593392496325),
 ('frank', 0.7827593392496325),
 ('backgrounds', 0.7827593392496325),
 ('gothic', 0.7820946665764414),
 ('sharp', 0.7814397877056235),
 ('achieved', 0.780158557549575),
 ('court', 0.7794752640484425),
 ('steals', 0.7789140023173704),
 ('rules', 0.7784447610718404),
 ('colors', 0.7768461994365922),
 ('reunion', 0.7731898882334817),
 ('covers', 0.7713993774596934),
 ('tale', 0.7701082216960737),
 ('rain', 0.7683706017975328),
 ('denzel', 0.768048488733063),
 ('stays', 0.7678707267558819),
 ('blob', 0.7672551527136672),
 ('maria', 0.7621400520468967),
 ('conventional', 0.7621400520468967),
 ('fresh', 0.7615843421131738),
 ('midnight', 0.7609697768987064),
 ('landscape', 0.758529939822797),
 ('animated', 0.7576857016975165),
 ('sunday', 0.7566605862822713),
 ('titanic', 0.7566605862822713),
 ('cagney', 0.7537718023763802),
 ('spring', 0.7537718023763802),
 ('enjoyable', 0.7524637577163648),
 ('immensely', 0.7519876805828787),
 ('sir', 0.7507762933965817),
 ('nevertheless', 0.7506710246981319),
 ('driven', 0.7499447789530785),
 ('performances', 0.7488325251606314),
 ('nowadays', 0.7472144018302211),
 ('memories', 0.7472144018302211),
 ('simple', 0.7464142097414326),
 ('leslie', 0.7453329337305156),
 ('golden', 0.7453329337305156),
 ('lovers', 0.7449722484245312),
 ('relationship', 0.7448423234560179),
 ('supporting', 0.7435780341868372),
 ('che', 0.742627237823315),
 ('packed', 0.7410032017375805),
 ('trek', 0.7402146914179311),
 ('provoking', 0.7384037721480662),
 ('strikes', 0.7375989431307791),
 ('depiction', 0.736822244062607),
 ('emotional', 0.7367821164568152),
 ('secretary', 0.7366322924996842),
 ('florida', 0.7351113796589775),
 ('influenced', 0.7351113796589775),
 ('germany', 0.7328875092094594),
 ('brings', 0.7314293671309623),
 ('lewis', 0.7312989465243216),
 ('elderly', 0.7308875085427924),
 ('owner', 0.7274362540385775),
 ('streets', 0.726669872598589),
 ('henry', 0.7264219694448174),
 ('portrays', 0.7259370033829363),
 ('bears', 0.7252354951114458),
 ('china', 0.7248958788745256),
 ('anger', 0.7243997240640498),
 ('society', 0.7243301079966333),
 ('available', 0.7241574173025055),
 ('best', 0.7234703406044631),
 ('bugs', 0.7227059828014898),
 ('magic', 0.718789611173283),
 ('delivers', 0.7184649885442351),
 ('verhoeven', 0.7184649885442351),
 ('jim', 0.7178397931503168),
 ('donald', 0.7166776779701394),
 ('endearing', 0.714653385780909),
 ('relationships', 0.713937950229019),
 ('greatly', 0.7125652664170469),
 ('brad', 0.7102416139192453),
 ('charlie', 0.7102416139192453),
 ('simon', 0.7096764825111558),
 ('effectively', 0.7091475219063864),
 ('march', 0.7077459799810979),
 ('atmosphere', 0.7074477307021416),
 ('influence', 0.7073318155519017),
 ('genius', 0.706392407309966),
 ('emotionally', 0.7055697005585024),
 ('ken', 0.7052685410922901),
 ('identity', 0.7048432203231365),
 ('sophisticated', 0.7047080029610213),
 ('dan', 0.7045758763835681),
 ('andrew', 0.7032995520239632),
 ('india', 0.7014459833746404),
 ('roy', 0.6997045811061043),
 ('surprisingly', 0.6995780708902356),
 ('sky', 0.6978091936657567),
 ('romantic', 0.6966498111111474),
 ('match', 0.6956692499926552),
 ('meets', 0.6931471805599453),
 ('cowboy', 0.6931471805599453),
 ('bitter', 0.6931471805599453),
 ('britain', 0.6931471805599453),
 ('stylish', 0.6931471805599453),
 ('affected', 0.6931471805599453),
 ('patient', 0.6931471805599453),
 ('wave', 0.6931471805599453),
 ('beatty', 0.6931471805599453),
 ('love', 0.6919853354193732),
 ('paul', 0.6898082792944307),
 ('andy', 0.688463331247519),
 ('performance', 0.6879738632797247),
 ('patrick', 0.6864581924091486),
 ('unlike', 0.6854646843879291),
 ('brooks', 0.6843365508777904),
 ('refuses', 0.6834852696482084),
 ('award', 0.6824518914431974),
 ('complaint', 0.6824518914431974),
 ('ride', 0.6822971645358795),
 ('dawson', 0.6817184847363226),
 ('luke', 0.6815863581588694),
 ('wells', 0.680877087968131),
 ('france', 0.6804081547825156),
 ('sports', 0.6800750989925926),
 ('handsome', 0.6800750989925926),
 ('directs', 0.6787584431078457),
 ('rebel', 0.6787584431078457),
 ('greater', 0.6760527472006452),
 ('dreams', 0.6759941013336959),
 ('effective', 0.6756540231124281),
 ('interpretation', 0.6747980418917487),
 ('works', 0.6744550475477928),
 ('brando', 0.6744550475477928),
 ('noble', 0.6737290947028437),
 ('paced', 0.6731465138532757),
 ('le', 0.6706743247078867),
 ('master', 0.6701576623352465),
 ('h', 0.6696166831497512),
 ('rings', 0.6690496289808848),
 ('easy', 0.6689599549459415),
 ('city', 0.6682082322126932),
 ('sunshine', 0.6678293725756554),
 ('succeeds', 0.666478933477784),
 ('relations', 0.664159643686693),
 ('england', 0.663876798259832),
 ('glimpse', 0.6632942174102642),
 ('aired', 0.6626879730752367),
 ('sees', 0.6626316366339948),
 ('both', 0.66248336767383),
 ('definitely', 0.6619978948389881),
 ('imaginative', 0.661398482245365),
 ('appreciate', 0.6608389373272875),
 ('tricks', 0.6607119048067914),
 ('striking', 0.6607119048067914),
 ('carefully', 0.6599949732430448),
 ('complicated', 0.6598107602923535),
 ('perspective', 0.6596244885213017),
 ('trilogy', 0.6587795370557376),
 ('future', 0.6583466514105283),
 ('lion', 0.6574290979578661),
 ('victor', 0.6554068525770982),
 ('douglas', 0.6554068525770982),
 ('inspired', 0.6545985104427103),
 ('marriage', 0.653926467406664),
 ('demands', 0.653926467406664),
 ('father', 0.6517232167219466),
 ('page', 0.6512362849443085),
 ('instant', 0.6505875661411494),
 ('era', 0.6495567444850836),
 ('saga', 0.6493445579015524),
 ('ruthless', 0.6493445579015524),
 ('joan', 0.6489139255831198),
 ('joseph', 0.6484112867185539),
 ('workers', 0.6482966143945935),
 ('fantasy', 0.6472675748092517),
 ('accomplished', 0.6455191315706907),
 ('distant', 0.6455191315706907),
 ('manhattan', 0.6443570163905132),
 ('personal', 0.6435502394205732),
 ('individual', 0.6431367599852839),
 ('pushing', 0.6431367599852839),
 ('meeting', 0.6431367599852839),
 ('pleasant', 0.6425034477411904),
 ('brave', 0.6418538861723947),
 ('william', 0.6408313911957847),
 ('hudson', 0.6407791950426294),
 ('friendly', 0.6394944670676251),
 ('eccentric', 0.6390799592896695),
 ('awards', 0.6387531084941465),
 ('jack', 0.6383830951499704),
 ('seeking', 0.6380874033769178),
 ('colonel', 0.6375773294051346),
 ('divorce', 0.6375773294051346),
 ('jane', 0.6344395797331673),
 ('keeping', 0.6341488397979895),
 ('gives', 0.6338356815949788),
 ('ted', 0.633427945858323),
 ('animation', 0.632086923798699),
 ('progress', 0.6317782341836532),
 ('concert', 0.6312717768418578),
 ('larger', 0.6312717768418578),
 ('nation', 0.6296337748376194),
 ('albeit', 0.6273958029971649),
 ('adapted', 0.6261364702769852),
 ('discovers', 0.6254290065049944),
 ('classic', 0.6250495642805052),
 ('segment', 0.6233514186244034),
 ('morgan', 0.6230376143729187),
 ('mouse', 0.6229429218866968),
 ('impressive', 0.6221114074431935),
 ('artist', 0.6216882165778004),
 ('ultimate', 0.6216882165778004),
 ('griffith', 0.621173680934856),
 ('drew', 0.6208265189803192),
 ('emily', 0.6208265189803192),
 ('moved', 0.6197197120051281),
 ('profound', 0.6190392084062235),
 ('families', 0.6190392084062235),
 ('innocent', 0.6185121991713645),
 ('versions', 0.6173091041684409),
 ('eddie', 0.6169198151720611),
 ('criticism', 0.6165139545390294),
 ('nature', 0.6159451465319409),
 ('recognized', 0.6151856390902335),
 ('sexuality', 0.6146755651184501),
 ('contract', 0.6140098600012215),
 ('brian', 0.6134404379492028),
 ('remembered', 0.6131044728864089),
 ('determined', 0.6123858239154869),
 ('offers', 0.6120793574711635),
 ('pleasure', 0.611957025829932),
 ('washington', 0.6118015411059929),
 ('images', 0.6115973135958376),
 ('games', 0.6106709587357068),
 ('academy', 0.6087298387473621),
 ('fashioned', 0.6079893722196384),
 ('melodrama', 0.6074917359814515),
 ('peoples', 0.6061358035703155),
 ('charismatic', 0.6061358035703155),
 ('rough', 0.6061358035703155),
 ('dealing', 0.6051784076139881),
 ('fine', 0.604969622680133),
 ('tap', 0.6039160468320027),
 ('trio', 0.6015799870344548),
 ('russell', 0.6012096852342597),
 ('figures', 0.6007738604289301),
 ('ward', 0.6000567574939334),
 ('brady', 0.5991182309116689),
 ('shine', 0.5991182309116689),
 ('job', 0.5984556212516866),
 ('satisfied', 0.5965203448708737),
 ('river', 0.5963796286249509),
 ('brown', 0.595773016534769),
 ('believable', 0.595660721333025),
 ('always', 0.5947071077466928),
 ('bound', 0.5947071077466928),
 ('hall', 0.5933967777928858),
 ('cook', 0.5916777203950857),
 ('claire', 0.5913644862500029),
 ('broadway', 0.5903376866937243),
 ('anna', 0.5877866649021191),
 ('peace', 0.5862840350175841),
 ('visually', 0.5853943192634992),
 ('morality', 0.5852582185487603),
 ('falk', 0.5852582185487603),
 ('growing', 0.5846665375658754),
 ('stood', 0.5831462853456169),
 ('experiences', 0.5831462853456169),
 ('touch', 0.58122926435596),
 ('lives', 0.5810976767513224),
 ('kubrick', 0.5806691971332549),
 ('timing', 0.5804740180558324),
 ('expressions', 0.5798184952529422),
 ('struggles', 0.5798184952529422),
 ('authentic', 0.5784842722398056),
 ('helen', 0.5776342934381009),
 ('pre', 0.5770075306472918),
 ('quirky', 0.5753641449035618),
 ('young', 0.5753167234453431),
 ('inner', 0.5745414381520985),
 ('mexico', 0.5744308737205633),
 ('clint', 0.5738004229273791),
 ('sisters', 0.5728610146854434),
 ('realism', 0.5722652889994956),
 ('personalities', 0.5720692490067093),
 ('french', 0.5720692490067093),
 ('adventures', 0.5711322299969818),
 ('surprises', 0.5711322299969818),
 ('overcome', 0.5697681593994407),
 ('timothy', 0.5695332245927687),
 ('tales', 0.5690945318899664),
 ('war', 0.5684331730278168),
 ('civil', 0.5679840376059393),
 ('countries', 0.5673777932709119),
 ('streep', 0.5671064596645803),
 ('tradition', 0.5668534552356532),
 ('oliver', 0.5667332557042867),
 ('australia', 0.5658077581833438),
 ('understanding', 0.5653138090500605),
 ('players', 0.5650952537000482),
 ('knowing', 0.5648928450362665),
 ('rogers', 0.5642134971840521),
 ('variety', 0.5636891133230585),
 ('suspenseful', 0.5636891133230585),
 ('true', 0.5628152518081007),
 ('jr', 0.5622098231124694),
 ('psychological', 0.5610874585468789),
 ('reminiscent', 0.5596157879354227),
 ('wealth', 0.5596157879354227),
 ('overwhelming', 0.5596157879354227),
 ('odds', 0.5596157879354227),
 ('branagh', 0.5596157879354227),
 ('grand', 0.5596157879354227),
 ('performing', 0.5596157879354227),
 ('sent', 0.5596157879354227),
 ('brothers', 0.5589118104336285),
 ('howard', 0.5581108967560025),
 ('david', 0.5569312225647537),
 ('generation', 0.556287997842748),
 ('grow', 0.5561253829956542),
 ('survival', 0.5559460590464603),
 ('mainstream', 0.5557473111575023),
 ('dick', 0.5543107357057295),
 ('charm', 0.5528817557540786),
 ('kirk', 0.5527898228650229),
 ('twists', 0.5524472984568102),
 ('gangster', 0.5520685823000399),
 ('jeff', 0.5517930622542137),
 ('family', 0.5511624451006553),
 ('tend', 0.5505330733611034),
 ('thanks', 0.5504908801584222),
 ('world', 0.5474423472343264),
 ('sutherland', 0.5474353693785516),
 ('life', 0.5469551443495992),
 ('disc', 0.5465437063680699),
 ('bug', 0.5465437063680699),
 ('tribute', 0.5455111817538808),
 ('europe', 0.5452270504833231),
 ('sacrifice', 0.5443015529623801),
 ('color', 0.5440512713943111),
 ('superior', 0.5433349023312852),
 ('york', 0.5431823586653651),
 ('pulls', 0.5426662296216495),
 ('hearts', 0.5423242908253617),
 ('jackson', 0.5423242908253617),
 ('enjoy', 0.5412428513590611),
 ('redemption', 0.5405675929647282),
 ('madness', 0.540384426007535),
 ('trial', 0.5389965007326869),
 ('greek', 0.5389965007326869),
 ('stands', 0.5389965007326869),
 ('hamilton', 0.5389965007326869),
 ('each', 0.5388212312554177),
 ('faithful', 0.5377330766859151),
 ('received', 0.5372768098531604),
 ('documentaries', 0.537142932083364),
 ('jealous', 0.537142932083364),
 ('different', 0.5370986068246082),
 ('describes', 0.5368011101692514),
 ('shorts', 0.5359615970375329),
 ('brilliance', 0.5355182363563621),
 ('mountains', 0.5349231753450512),
 ('dealt', 0.5340824859302579),
 ('share', 0.5340824859302579),
 ('explore', 0.5332984796180493),
 ('providing', 0.5332984796180493),
 ('series', 0.5325809226575603),
 ('fellow', 0.5323318289869543),
 ('olivier', 0.5306282510621704),
 ('revolution', 0.5306282510621704),
 ('roman', 0.5306282510621704),
 ('loves', 0.5306282510621704),
 ('century', 0.5300278307499267),
 ('musical', 0.5296687115674706),
 ('heroic', 0.5292593254548287),
 ('ironically', 0.5280674302004967),
 ('temple', 0.5280674302004967),
 ('approach', 0.5280674302004967),
 ('moves', 0.5279372642387119),
 ('gift', 0.5270203096859714),
 ('julie', 0.5260930958967791),
 ('tells', 0.52415107836314),
 ('radio', 0.5239467117286878),
 ('uncle', 0.5235443961737654),
 ('union', 0.5232481437645479),
 ('deep', 0.523095716357805),
 ('reminds', 0.5215784155422524),
 ('famous', 0.5211884108015372),
 ('jazz', 0.5205344378929515),
 ('dennis', 0.5198754592859086),
 ('epic', 0.5191938734365074),
 ('adult', 0.519167695083386),
 ('shows', 0.519153222203753),
 ('performed', 0.5191244265806858),
 ('demons', 0.5191244265806858),
 ('discovered', 0.5187937934151675),
 ('eric', 0.5187937934151675),
 ('youth', 0.5185626062681431),
 ('human', 0.5185141122498709),
 ('tarzan', 0.5181382706122772),
 ('ourselves', 0.5179430915348546),
 ('wwii', 0.5175824062288704),
 ('passion', 0.5162164724008671),
 ('desire', 0.5160749796521344),
 ('pays', 0.5158131652770298),
 ('fox', 0.5155762265245886),
 ('dirty', 0.5155762265245886),
 ('sympathetic', 0.5154660033224929),
 ('symbolism', 0.5154660033224929),
 ('attitude', 0.5153099362133193),
 ('jeremy', 0.5146644000731564),
 ('appearances', 0.5146644000731564),
 ('fun', 0.5143906899304869),
 ('south', 0.5142097217502312),
 ('arrives', 0.5140989491109599),
 ('present', 0.5134196589430373),
 ('com', 0.5132616785638717),
 ('smile', 0.5126588048476517),
 ('countryside', 0.5108256237659907),
 ('fits', 0.5108256237659907),
 ('visit', 0.5108256237659907),
 ('carter', 0.5108256237659907),
 ('ring', 0.5108256237659907),
 ('provided', 0.5108256237659907),
 ('aging', 0.5108256237659907),
 ('alan', 0.5108256237659907),
 ('begins', 0.5101565036339665),
 ('japan', 0.5090057870490047),
 ('success', 0.5090057870490047),
 ('accurate', 0.5089547158301789),
 ('proud', 0.5080047474243493),
 ('daily', 0.5075946031845443),
 ('atmospheric', 0.5072478024181067),
 ('karloff', 0.5072478024181067),
 ('recently', 0.5071491490366821),
 ('fu', 0.5070449009260847),
 ('horrors', 0.5065612249795332),
 ('finding', 0.5063712734166104),
 ('lust', 0.5059356384717989),
 ('hitchcock', 0.50574947073413),
 ('among', 0.5033400495133273),
 ('viewing', 0.5030213982744091),
 ('shining', 0.5026288565618122),
 ('investigation', 0.5026288565618122),
 ('duo', 0.5020919437972361),
 ('cameron', 0.5020919437972361),
 ('finds', 0.501283031005398),
 ('contemporary', 0.5007752879124892),
 ('genuine', 0.500462836730444),
 ('frightening', 0.49995595152908684),
 ('plays', 0.49975983848890226),
 ('age', 0.49941323171424595),
 ('position', 0.4989911661189878),
 ('continues', 0.4986303506721724),
 ('roles', 0.4983971655075218),
 ('james', 0.498372162694704),
 ('individuals', 0.4982468415591305),
 ('brought', 0.49783842823917956),
 ('hilarious', 0.4971455198619106),
 ('brutal', 0.49681488669639234),
 ('appropriate', 0.49643688631389105),
 ('dance', 0.4958199831481205),
 ('league', 0.49578774640145024),
 ('answers', 0.49578774640145024),
 ('helping', 0.49578774640145024),
 ('stunts', 0.49561620510246196),
 ('traveling', 0.4953214372300254),
 ('thoroughly', 0.49414593456733524),
 ('depicted', 0.4931706885272699),
 ('differences', 0.49247648509779424),
 ('honor', 0.49247648509779424),
 ('combination', 0.49247648509779424),
 ('fully', 0.4921334907538381),
 ('tracy', 0.49159426183810306),
 ('battles', 0.4914075379088891),
 ('possibility', 0.4911205526866582),
 ('romance', 0.4901589869574316),
 ('initially', 0.49002249613622745),
 ('happy', 0.4898997500608791),
 ('crime', 0.48977221456815834),
 ('singing', 0.4893852925281213),
 ('especially', 0.48901267837860624),
 ('shakespeare', 0.4875479388966451),
 ('hugh', 0.4872951263557966),
 ('detail', 0.4860948425082735),
 ('desperation', 0.4855078157817008),
 ('san', 0.4855078157817008),
 ('julia', 0.4855078157817008),
 ('companion', 0.4855078157817008),
 ('guide', 0.4855078157817008),
 ('strongly', 0.48460242866688824),
 ('necessary', 0.48302334245403883),
 ('humanity', 0.48265474679929443),
 ('drama', 0.48221998493060503),
 ('nonetheless', 0.4818380868927384),
 ('warming', 0.4818380868927384),
 ('cuba', 0.4818380868927384),
 ('intrigue', 0.4818380868927384),
 ('planned', 0.4795730802618863),
 ('pictures', 0.4792993701192168),
 ('broadcast', 0.4784902431230542),
 ('nine', 0.47803580094299974),
 ('settings', 0.47743860773325364),
 ('history', 0.4773296693378085),
 ('ordinary', 0.4772588001269074),
 ('trade', 0.47692407209030935),
 ('official', 0.4760826753221178),
 ('primary', 0.4760826753221178),
 ('episode', 0.4752962026115043),
 ('role', 0.47520268270188676),
 ('spirit', 0.4747769079983932),
 ('grey', 0.4740936144972607),
 ('ways', 0.47323464982718205),
 ('cup', 0.472604410945793),
 ('piano', 0.472604410945793),
 ('familiar', 0.4724161756511195),
 ('sinister', 0.4719857904497268),
 ('reveal', 0.47171449364936496),
 ('max', 0.4715085204251558),
 ('dated', 0.4712164856709448),
 ('vicious', 0.47000362924573563),
 ('losing', 0.47000362924573563),
 ('discovery', 0.47000362924573563),
 ('genuinely', 0.46871413841586385),
 ('hatred', 0.46734051182625186),
 ('mistaken', 0.4670230011075978),
 ('challenge', 0.46608972992459924),
 ('dream', 0.46608972992459924),
 ('crisis', 0.46575733836428446),
 ('photographed', 0.4648885285789651),
 ('machines', 0.4643056081310978),
 ('bird', 0.4643056081310978),
 ('critics', 0.4643056081310978),
 ('born', 0.4641138351896721),
 ('detective', 0.4636633473511525),
 ('higher', 0.46328467899699055),
 ('inevitable', 0.46262352194811296),
 ('remains', 0.46262352194811296),
 ('soviet', 0.4618180446592961),
 ('ryan', 0.461345566502621),
 ('african', 0.46112595521371813),
 ('smaller', 0.46081520319132935),
 ('techniques', 0.46052488529119184),
 ('information', 0.4603417183339986),
 ('deserved', 0.45999798712841444),
 ('cynical', 0.45953232937844013),
 ('lynch', 0.45953232937844013),
 ('tour', 0.45953232937844013),
 ('francisco', 0.45953232937844013),
 ('spielberg', 0.45953232937844013),
 ('struggle', 0.45911782160048453),
 ('language', 0.4590212125771265),
 ('visual', 0.4582351440882285),
 ('warner', 0.45724137763188427),
 ('social', 0.45720078250735313),
 ('reality', 0.45719346885019546),
 ('hidden', 0.4567584024957149),
 ('breaking', 0.4560173872709956),
 ('sometimes', 0.45563021171182794),
 ('modern', 0.45500247579345005),
 ('surfing', 0.4542552722775964),
 ('popular', 0.45410691533051023),
 ('surprised', 0.4534409399850382),
 ('follows', 0.4524536175440835),
 ('keeps', 0.45234869400701483),
 ('john', 0.4520909494482197),
 ('mixed', 0.4519851237430572),
 ('defeat', 0.4519851237430572),
 ('justice', 0.4514272436728002),
 ('treasure', 0.45083371313801535),
 ('presents', 0.44973793178615257),
 ('years', 0.4491919703210497),
 ('chief', 0.4489502200479032),
 ('shadows', 0.44802472252696035),
 ('closely', 0.4470141110210369),
 ('segments', 0.4470141110210369),
 ('lose', 0.446583355037637),
 ('caine', 0.44628710262841953),
 ('caught', 0.4461027538399907),
 ('hamlet', 0.44558510189758965),
 ('chinese', 0.4450742462032102),
 ('welcome', 0.4443805243578379),
 ('birth', 0.4436863209283622),
 ('represents', 0.44320543609101143),
 ('puts', 0.4427910657208508),
 ('closer', 0.44183275227903923),
 ('fame', 0.44183275227903923),
 ('visuals', 0.44183275227903923),
 ('web', 0.44183275227903923),
 ('criminal', 0.4412745608048752),
 ('minor', 0.4409224199448939),
 ('jon', 0.44086703515908027),
 ('liked', 0.4407499151402072),
 ('restaurant', 0.44031183943833246),
 ('de', 0.4398327516123722),
 ('flaws', 0.4398327516123722),
 ('searching', 0.4393666597838457),
 ('rap', 0.4389130421757044),
 ('light', 0.4388443301819989),
 ('elizabeth', 0.43872232986464677),
 ('marry', 0.4386173154250649),
 ('learned', 0.4382549309311553),
 ('controversial', 0.4382549309311553),
 ('oz', 0.4382549309311553),
 ('slowly', 0.4378566038993998),
 ('wayne', 0.43721380642274466),
 ('comedic', 0.43721380642274466),
 ('thrilling', 0.43721380642274466),
 ('bridge', 0.43721380642274466),
 ('married', 0.4365850168219689),
 ('nazi', 0.4361020775700542),
 ('physical', 0.4353180712578455),
 ('murder', 0.4353180712578455),
 ('johnny', 0.43483971678806865),
 ('michelle', 0.4344526449814167),
 ('wallace', 0.4340384805522204),
 ('silent', 0.43395706390247063),
 ('comedies', 0.43395706390247063),
 ('played', 0.43387244114515305),
 ('international', 0.43363598507486073),
 ('vision', 0.4328640822962789),
 ('intelligent', 0.431967048853671),
 ('shop', 0.43078291609245434),
 ('also', 0.4303672020976917),
 ('levels', 0.4302451371066513),
 ('miss', 0.4300642671215322),
 ('revolutionary', 0.4295626596872249),
 ...]
In [17]:
# Words most strongly associated with a "NEGATIVE" label: most_common() lists
# entries from highest to lowest score, so reversing it and keeping the first
# 30 items yields the 30 lowest-scoring words, lowest first.
# NOTE(review): pos_neg_ratios is defined in an earlier cell; the negative
# scores in the output suggest log ratios — confirm against that cell.
list(reversed(pos_neg_ratios.most_common()))[0:30]

# Note: Above is the code Andrew uses in his solution video, 
#       so we've included it here to avoid confusion.
#       If you explore the documentation for the Counter class, 
#       you will see you could also find the 30 least common
#       words like this: pos_neg_ratios.most_common()[:-31:-1]
Out[17]:
[('boll', -4.969813299576001),
 ('uwe', -4.624972813284271),
 ('seagal', -3.644143560272545),
 ('unwatchable', -3.258096538021482),
 ('stinker', -3.2088254890146994),
 ('mst', -2.9502698994772336),
 ('incoherent', -2.9368917735310576),
 ('unfunny', -2.6922395950755678),
 ('waste', -2.6193845640165536),
 ('blah', -2.5704288232261625),
 ('horrid', -2.4849066497880004),
 ('pointless', -2.4553061800117097),
 ('atrocious', -2.4259083090260445),
 ('redeeming', -2.3682390632154826),
 ('prom', -2.3608540011180215),
 ('drivel', -2.3470368555648795),
 ('lousy', -2.307572634505085),
 ('worst', -2.286987896180378),
 ('laughable', -2.264363880173848),
 ('awful', -2.227194247027435),
 ('poorly', -2.2207550747464135),
 ('wasting', -2.204604684633842),
 ('remotely', -2.1972245773362196),
 ('existent', -2.0794415416798357),
 ('boredom', -1.995100393246085),
 ('miserably', -1.9924301646902063),
 ('sucks', -1.987068221548821),
 ('uninspired', -1.9832976811269336),
 ('lame', -1.981767458946166),
 ('insult', -1.978345424808467)]

Transforming Text into Numbers

In [18]:
# Import Image in this cell so it runs on a fresh kernel (Restart & Run All);
# re-importing is harmless if the name is already in scope.
from IPython.display import Image

# Example review used to illustrate how text is fed into the network.
review = "The movie was excellent"

# Last expression of the cell: renders the diagram as the cell output.
Image(filename='sentiment_network_pos.png')
Out[18]:

Creating the Input/Output Data

In [19]:
vocab = set(total_counts.keys())
In [20]:
vocab_size = len(vocab)
print(vocab_size)
74074

Take a look at the following image. It represents the layers of the neural network you'll be building throughout this notebook. layer_0 is the input layer, layer_1 is a hidden layer, and layer_2 is the output layer.

In [1]:
from IPython.display import Image
Image(filename='sentiment_network_2.png')
Out[1]:
In [21]:
layer_0 = np.zeros((1,vocab_size))
In [22]:
layer_0.shape
Out[22]:
(1, 74074)
In [23]:
from IPython.display import Image
Image(filename='sentiment_network.png')
Out[23]:

layer_0 contains one entry for every word in the vocabulary, as shown in the above image. We need to make sure we know the index of each word, so run the following cell to create a lookup table that stores the index of every word.

In [23]:
# Lookup table from every vocabulary word to its index position
# (the position that word occupies in layer_0). Positions follow the
# iteration order of vocab.
word2index = {token: position for position, token in enumerate(vocab)}

# display the map of words to indices
word2index
Out[23]:
{'': 0,
 'veiw': 64082,
 'sententious': 1,
 'taxi': 49270,
 'dietrichson': 3,
 'piffle': 24656,
 'blackwoods': 4,
 'damini': 5,
 'tywker': 36870,
 'unstoned': 6,
 'luxemburg': 7,
 'subspecies': 49273,
 'multinationals': 9,
 'gawkers': 10,
 'supernovas': 13,
 'lidia': 66642,
 'ngela': 18,
 'dirtier': 15,
 'identifiable': 16,
 'emery': 17,
 'iiiiii': 19,
 'peyote': 20,
 'jennings': 22,
 'predictable': 23,
 'palpitation': 24,
 'annakin': 25,
 'overdoses': 26,
 'shuts': 29,
 'belie': 12382,
 'bagatelle': 32,
 'combusted': 33,
 'relocates': 34,
 'linoleum': 35,
 'directer': 36,
 'exploitationer': 37,
 'panavision': 40,
 'pantyhose': 39,
 'menzies': 42,
 'elya': 45,
 'jaws': 44,
 'bleeth': 47,
 'talisman': 48,
 'gaijin': 49,
 'nationally': 49278,
 'boys': 50,
 'colwell': 51,
 'showtim': 52,
 'unhittable': 53,
 'homicide': 54,
 'horniness': 55,
 'celi': 62,
 'druthers': 63,
 'hallan': 59,
 'berenger': 61,
 'resell': 65,
 'journal': 66,
 'hustled': 68,
 'chewed': 49280,
 'killian': 17880,
 'visage': 58461,
 'range': 69,
 'fanfavorite': 70,
 'plumped': 12388,
 'tow': 8,
 'dreamland': 72,
 'unsettle': 75,
 'hawks': 74,
 'periodically': 49282,
 'galigula': 76,
 'tuco': 78,
 'ceramics': 24669,
 'montreal': 79,
 'canonized': 80,
 'gautam': 11,
 'pacifist': 84,
 'blustery': 29420,
 'nervousness': 44689,
 'lucienne': 24671,
 'duisburg': 49285,
 'perishing': 85,
 'fended': 86,
 'wallflower': 87,
 'abounding': 88,
 'sunshine': 91,
 'laawaris': 90,
 'hrt': 93,
 'confuddled': 95,
 'breeches': 96,
 'bitching': 97,
 'transatlantic': 104,
 'tingles': 99,
 'prieuve': 101,
 'notions': 102,
 'mckenna': 103,
 'yvaine': 24674,
 'delerue': 106,
 'counseled': 109,
 'shurikens': 110,
 'yoshimura': 112,
 'shove': 113,
 'hammy': 114,
 'galipeau': 115,
 'simplistically': 120,
 'producers': 118,
 'slogan': 36893,
 'buttress': 119,
 'cavities': 121,
 'mcdowel': 31550,
 'unfortanetley': 61712,
 'fizzle': 122,
 'prevail': 12395,
 'carhart': 123,
 'actullly': 124,
 'ladykillers': 125,
 'motor': 126,
 'fugue': 24678,
 'tomorowo': 127,
 'talmud': 49292,
 'taandav': 129,
 'odyssey': 130,
 'downtime': 131,
 'overseen': 52385,
 'morin': 133,
 'defying': 134,
 'transsexual': 28,
 'bearand': 24680,
 'nozzle': 72309,
 'harley': 136,
 'itcan': 137,
 'girly': 138,
 'stabilize': 139,
 'arose': 140,
 'unnamed': 141,
 'scholes': 27,
 'swaggering': 142,
 'unremarkable': 24683,
 'avert': 143,
 'delinquency': 144,
 'crinolines': 145,
 'copolla': 146,
 'sren': 147,
 'redoing': 54380,
 'shakingly': 148,
 'bowties': 151,
 'aa': 150,
 'nightgown': 152,
 'cada': 153,
 'admire': 154,
 'facing': 155,
 'humans': 156,
 'fulfilment': 12404,
 'warwick': 157,
 'phool': 24689,
 'sadism': 158,
 'criticised': 159,
 'prousalis': 167,
 'wtf': 161,
 'lorraina': 163,
 'delve': 165,
 'matt': 168,
 'tko': 169,
 'spraying': 170,
 'congested': 171,
 'moshana': 172,
 'autie': 176,
 'liga': 174,
 'lasting': 175,
 'apposed': 177,
 'survey': 178,
 'queef': 179,
 'gaspingly': 180,
 'salvo': 181,
 'known': 30,
 'use': 182,
 'panaghoy': 183,
 'kanaly': 27680,
 'pasolini': 49301,
 'stats': 185,
 'yecch': 188,
 'wagons': 187,
 'meara': 189,
 'reyes': 192,
 'conti': 193,
 'flapper': 195,
 'lajos': 196,
 'cellular': 70193,
 'marks': 12412,
 'subconsciousness': 197,
 'filicide': 198,
 'imo': 200,
 'horne': 201,
 'perfectionistic': 202,
 'starched': 204,
 'bejeebers': 205,
 'fern': 207,
 'bandit': 208,
 'die': 29749,
 'premiered': 209,
 'zane': 72444,
 'nomolos': 61924,
 'natalia': 210,
 'dins': 211,
 'heritage': 212,
 'ours': 213,
 'ostracization': 36910,
 'oleary': 214,
 'flounce': 215,
 'spoiled': 65900,
 'eponymous': 38,
 'class': 216,
 'produces': 217,
 'speeders': 218,
 'underfoot': 61733,
 'cini': 219,
 'jenuet': 46,
 'labyrinth': 220,
 'cremated': 226,
 'icarus': 225,
 'nastassja': 67167,
 'tak': 43,
 'misquoted': 227,
 'azadi': 230,
 'stills': 229,
 'howard': 50628,
 'restrooms': 232,
 'indefatigable': 233,
 'irreverently': 235,
 'micky': 236,
 'prizefighting': 71253,
 'ankylosaur': 237,
 'romeros': 238,
 'creditsof': 241,
 'hairdewed': 242,
 'disadvantaged': 61737,
 'hominid': 243,
 'arquett': 252,
 'surronding': 245,
 'pleasantness': 255,
 'iago': 247,
 'brash': 248,
 'mohan': 251,
 'boldly': 250,
 'centralized': 253,
 'maneuverability': 254,
 'lauderdale': 258,
 'collectibles': 257,
 'bilal': 259,
 'griffith': 49313,
 'eccentric': 260,
 'sadie': 60083,
 'looted': 262,
 'reclining': 52753,
 'onegin': 61739,
 'surreality': 263,
 'righteous': 264,
 'kayru': 265,
 'friers': 266,
 'chaliapin': 267,
 'tinhorns': 273,
 'treasury': 270,
 'stever': 271,
 'moderne': 274,
 'peary': 36528,
 'deduces': 275,
 'shariff': 276,
 'cums': 277,
 'souler': 278,
 'summarily': 279,
 'quoting': 280,
 'sandrine': 281,
 'wilnona': 282,
 'ethiopia': 283,
 'property': 73218,
 'mastermind': 284,
 'ahem': 67320,
 'teletype': 285,
 'lomax': 286,
 'unsetteling': 24699,
 'floppy': 56,
 'martino': 288,
 'hofman': 24700,
 'sods': 290,
 'alter': 32926,
 'opportunities': 291,
 'unwrapping': 292,
 'unsteady': 24701,
 'scarlatina': 57,
 'fatherlands': 293,
 'feces': 61752,
 'dkman': 294,
 'myiazaki': 295,
 'carle': 296,
 'invaders': 297,
 'apology': 298,
 'roses': 299,
 'disturbia': 302,
 'impersonalized': 303,
 'lehrman': 71469,
 'kose': 307,
 'gaggling': 308,
 'highly': 306,
 'florida': 311,
 'workaholics': 312,
 'spurist': 314,
 'muffling': 315,
 'outs': 56825,
 'hurriedly': 316,
 'snakes': 317,
 'nox': 66497,
 'conversed': 318,
 'jang': 319,
 'recycled': 52004,
 'expressions': 320,
 'vistor': 322,
 'kagemusha': 36927,
 'corporate': 323,
 'prollific': 329,
 'donna': 325,
 'wires': 49324,
 'points': 327,
 'montage': 25309,
 'samir': 330,
 'cannavale': 331,
 'heterosexuality': 332,
 'pox': 338,
 'velda': 334,
 'dreadfull': 335,
 'interviewed': 336,
 'scifi': 24711,
 'kibbutznikim': 340,
 'undertones': 341,
 'scarlatti': 342,
 'advice': 67,
 'bravest': 344,
 'liberace': 345,
 'racer': 12429,
 'eagerly': 346,
 'pontificate': 350,
 'uhs': 348,
 'hutchins': 51925,
 'standalone': 33755,
 'kidswell': 352,
 'grounded': 61758,
 'gurus': 353,
 'sanitizing': 355,
 'landesberg': 65924,
 'outshine': 358,
 'closer': 359,
 'paulin': 61761,
 'thurl': 365,
 'cinematographicly': 362,
 'ignominiously': 36933,
 'punches': 364,
 'yella': 24717,
 'practically': 366,
 'carving': 368,
 'tsotg': 369,
 'monthy': 370,
 'irrefutable': 24718,
 'pepe': 372,
 'mistress': 73235,
 'andrei': 373,
 'despaired': 374,
 'settlements': 375,
 'princeton': 12436,
 'gushes': 376,
 'apartment': 377,
 'commences': 71,
 'madigan': 57137,
 'modulate': 379,
 'parr': 380,
 'watchably': 381,
 'leidner': 382,
 'amiche': 383,
 'bonet': 384,
 'cokehead': 385,
 'preposterously': 386,
 'menelaus': 387,
 'slothy': 70271,
 'conelly': 388,
 'salaryman': 389,
 'fetchingly': 391,
 'aces': 49335,
 'ebonic': 47157,
 'js': 392,
 'dattilo': 393,
 'couco': 49337,
 'leander': 394,
 'slow': 395,
 'crabby': 397,
 'legitimates': 77,
 'grabber': 398,
 'wiki': 399,
 'proletariat': 49339,
 'rock': 400,
 'pinkins': 401,
 'river': 67497,
 'gores': 24727,
 'syringes': 402,
 'loudhailer': 403,
 'calvero': 406,
 'strumming': 407,
 'homely': 408,
 'goldfinger': 409,
 'lifewell': 410,
 'iroquois': 12444,
 'tsunehiko': 411,
 'leper': 412,
 'newscast': 413,
 'takaishvili': 414,
 'tweety': 49345,
 'travelling': 24731,
 'kiesser': 415,
 'straws': 416,
 'cocoa': 417,
 'heathrow': 419,
 'recollecting': 81,
 'urges': 420,
 'coda': 421,
 'considine': 422,
 'eensy': 423,
 'ounces': 424,
 'camaro': 425,
 'conservatively': 83,
 'nintendo': 426,
 'groener': 427,
 'jayden': 49347,
 'gazarra': 63820,
 'hollanders': 24735,
 'schulberg': 64861,
 'hartley': 428,
 'bardot': 429,
 'marianbad': 431,
 'mais': 49351,
 'contours': 63616,
 'speedily': 433,
 'artificially': 49352,
 'demonstrator': 435,
 'watson': 436,
 'cleancut': 25863,
 'gaionsbourg': 441,
 'unmasking': 438,
 'skyrocket': 439,
 'operatora': 440,
 'subtracts': 12449,
 'thesis': 442,
 'unferth': 36947,
 'grieco': 443,
 'coolne': 445,
 'nada': 446,
 'mega': 447,
 'scrawny': 448,
 'amell': 449,
 'lawrence': 450,
 'height': 453,
 'scuzzy': 454,
 'finder': 455,
 'mentally': 456,
 'tashi': 458,
 'tarnish': 459,
 'sycophantic': 49357,
 'haystack': 460,
 'interwhined': 461,
 'culprits': 462,
 'xmas': 73800,
 'euthanizes': 464,
 'megan': 465,
 'indicate': 466,
 'woodcraft': 467,
 'mockumentary': 468,
 'mckim': 65239,
 'marmont': 36953,
 'libraries': 61784,
 'hubbard': 470,
 'facials': 471,
 'relaxers': 472,
 'leer': 474,
 'yound': 475,
 'salacious': 476,
 'bromwich': 477,
 'tourneur': 479,
 'socialists': 481,
 'floored': 482,
 'gnashing': 486,
 'pingo': 487,
 'bland': 89,
 'rb': 488,
 'bleakest': 489,
 'swerve': 63215,
 'juggles': 92,
 'rewind': 490,
 'afterwards': 54508,
 'compilation': 491,
 'latrina': 494,
 'trina': 493,
 'stereotypes': 495,
 'decieve': 36956,
 'sliminess': 496,
 'whatchoo': 500,
 'skeptic': 499,
 'nilsen': 501,
 'tropical': 502,
 'puccini': 503,
 'deft': 505,
 'schlubs': 506,
 'reprised': 94,
 'crucify': 508,
 'kailin': 509,
 'terrorised': 68313,
 'mgr': 510,
 'gidwani': 511,
 'spielbergian': 513,
 'wright': 71343,
 'janitors': 515,
 'malick': 12460,
 'scarlet': 72204,
 'silla': 517,
 'puzzlers': 519,
 'germinates': 100,
 'dormants': 520,
 'caetano': 521,
 'inarticulate': 527,
 'thumbnail': 529,
 'decisionsin': 524,
 'gauntlet': 525,
 'notethe': 531,
 'semi': 98,
 'deano': 532,
 'saunders': 61794,
 'scrimmages': 530,
 'fenner': 61795,
 'panther': 56003,
 'law': 534,
 'blazing': 536,
 'filmirage': 537,
 'mulholland': 538,
 'som': 61797,
 'trite': 539,
 'evidently': 49364,
 'worshiper': 105,
 'mansquito': 541,
 'expeditiously': 542,
 'denoument': 544,
 'selfishly': 545,
 'awaaaaay': 546,
 'slickest': 547,
 'cadfile': 548,
 'scarlett': 549,
 'slowmotion': 550,
 'deteriorates': 551,
 'wushu': 555,
 'pestilence': 553,
 'flanked': 554,
 'encyclopedia': 49368,
 'witted': 556,
 'eclipse': 24754,
 'items': 557,
 'nekhron': 57729,
 'looming': 559,
 'fiending': 560,
 'tumors': 561,
 'notably': 562,
 'jeering': 37226,
 'planetary': 563,
 'richman': 564,
 'rossellini': 67526,
 'cagliostro': 566,
 'burkhalter': 12472,
 'joh': 567,
 'stinkbug': 12473,
 'vegeburgers': 569,
 'prez': 57834,
 'perk': 570,
 'spanglish': 571,
 'foods': 572,
 'religiosity': 574,
 'liman': 111,
 'entitlement': 575,
 'boz': 576,
 'missoula': 60951,
 'rating': 70278,
 'dispatched': 578,
 'authur': 579,
 'shiranui': 580,
 'aides': 581,
 'tycoons': 582,
 'allowing': 583,
 'whispering': 584,
 'nfa': 586,
 'kamikaze': 587,
 'goodmans': 12479,
 'tweaking': 58664,
 'metropolis': 51914,
 'romantic': 588,
 'stymie': 52651,
 'actingjob': 590,
 'goonies': 591,
 'faceless': 593,
 'indications': 594,
 'darkest': 596,
 'fowzi': 597,
 'dond': 598,
 'flirts': 599,
 'modulation': 24762,
 'nastiness': 66230,
 'augusten': 600,
 'outstretched': 601,
 'unnatural': 602,
 'tolstoy': 12482,
 'britishness': 603,
 'inquiring': 604,
 'operates': 605,
 'brenna': 606,
 'politicization': 612,
 'bullst': 608,
 'whoring': 610,
 'chase': 613,
 'gangbangers': 615,
 'descending': 618,
 'prickly': 619,
 'candoli': 620,
 'capri': 623,
 'predestined': 624,
 'diggers': 117,
 'flava': 626,
 'luckless': 49379,
 'nyatta': 628,
 'clampett': 629,
 'labours': 630,
 'ethical': 631,
 'alta': 632,
 'berg': 633,
 'cluelessly': 634,
 'bioterrorism': 635,
 'charlene': 636,
 'sbs': 638,
 'constituted': 640,
 'colbet': 64154,
 'risqu': 641,
 'monson': 642,
 'stephanie': 643,
 'give': 61811,
 'goivernment': 646,
 'hegalhuzen': 645,
 'hoast': 647,
 'domestic': 650,
 'versatility': 651,
 'depravation': 652,
 'municipal': 49382,
 'wynorski': 654,
 'liberation': 655,
 'foreseen': 656,
 'kotch': 658,
 'celestine': 36977,
 'guests': 659,
 'pooped': 660,
 'jhutsi': 662,
 'kristensen': 663,
 'tesc': 664,
 'hiya': 665,
 'jeopardised': 49384,
 'ner': 53904,
 'franker': 672,
 'granting': 667,
 'mock': 668,
 'carload': 671,
 'ineluctably': 12493,
 'elders': 673,
 'piere': 679,
 'nashville': 675,
 'bullsh': 676,
 'unevenly': 68438,
 'tyrant': 678,
 'unconstitutional': 24771,
 'legendary': 680,
 'mosque': 681,
 'kafka': 36984,
 'piercing': 73724,
 'presumptive': 682,
 'dukakas': 683,
 'emancipation': 684,
 'buchanan': 685,
 'portugal': 54433,
 'wiliam': 686,
 'discovered': 687,
 'bullion': 12496,
 'mcenroe': 688,
 'roughs': 689,
 'sever': 690,
 'bludge': 691,
 'arshad': 692,
 'platitude': 693,
 'probibly': 695,
 'alecky': 59719,
 'diferent': 128,
 'politic': 696,
 'gertrude': 697,
 'misting': 698,
 'unsuprised': 699,
 'eventually': 700,
 'conscript': 61231,
 'fdr': 50385,
 'artful': 702,
 'bakery': 703,
 'obtuse': 704,
 'ozma': 705,
 'possibly': 706,
 'forums': 707,
 'cosima': 708,
 'vo': 61821,
 'merr': 25554,
 'article': 49390,
 'apologize': 712,
 'wisest': 132,
 'presents': 714,
 'potch': 715,
 'scumbags': 716,
 'culpas': 717,
 'usc': 718,
 'blight': 719,
 'clued': 12499,
 'eventy': 720,
 'gazelle': 721,
 'turpin': 724,
 'homestead': 723,
 'modesty': 725,
 'rebel': 727,
 'style': 728,
 'totemic': 49394,
 'jabber': 730,
 'upswept': 731,
 'dampening': 732,
 'blinkered': 733,
 'drift': 734,
 'overpowered': 735,
 'bleaker': 65475,
 'munnera': 736,
 'tobruk': 737,
 'hately': 738,
 'kitted': 12504,
 'eradication': 739,
 'warrants': 740,
 'washoe': 741,
 'kusugi': 743,
 'nickeloden': 71644,
 'anothwer': 744,
 'flambards': 745,
 'farther': 746,
 'roththe': 747,
 'martine': 56442,
 'commishioner': 748,
 'raider': 749,
 'tree': 73655,
 'disregards': 751,
 'plainclothes': 135,
 'brambell': 753,
 'mod': 55880,
 'novice': 754,
 'ostensibly': 756,
 'aged': 758,
 'saying': 759,
 'vault': 61834,
 'optical': 51985,
 'nestled': 49400,
 'tired': 761,
 'swings': 762,
 'reliability': 49403,
 'gq': 72575,
 'makeout': 24784,
 'repopulate': 49402,
 'back': 764,
 'wistfulness': 765,
 'findlay': 771,
 'costars': 767,
 'feats': 768,
 'prudence': 773,
 'carrys': 774,
 'bank': 772,
 'thriteen': 777,
 'physicist': 778,
 'happens': 779,
 'grieve': 780,
 'vindicate': 781,
 'galicia': 782,
 'debutants': 783,
 'sabra': 36998,
 'alls': 788,
 'category': 785,
 'minded': 789,
 'divali': 791,
 'satirically': 790,
 'malformations': 24788,
 'hassling': 793,
 'schygula': 794,
 'put': 6404,
 'stefans': 795,
 'examplewhen': 49406,
 'board': 73732,
 'gunned': 796,
 'statistical': 797,
 'peva': 54742,
 'storage': 798,
 'hoon': 799,
 'adored': 800,
 'zither': 804,
 'metrosexual': 802,
 'brocoli': 805,
 'broek': 46469,
 'farmworker': 806,
 'portabellow': 807,
 'tigress': 808,
 'danyael': 809,
 'coeds': 810,
 'understudies': 811,
 'carts': 812,
 'tacones': 814,
 'binouche': 71799,
 'leitmotif': 815,
 'meditations': 816,
 'denier': 817,
 'fatso': 818,
 'geisha': 819,
 'idealists': 820,
 'firefighters': 821,
 'noriyuki': 822,
 'blasco': 825,
 'kiriya': 824,
 'reichdeutch': 826,
 'disjointing': 827,
 'sati': 61845,
 'whored': 829,
 'peebles': 52166,
 'bezzerides': 831,
 'eames': 833,
 'stamos': 838,
 'klaymation': 836,
 'veterans': 835,
 'fooling': 839,
 'jorma': 840,
 'appliances': 841,
 'deuces': 842,
 'bellerophon': 843,
 'moovies': 61849,
 'abolitionism': 844,
 'bombast': 846,
 'shuddering': 847,
 'wooofff': 848,
 'swagger': 849,
 'turgenev': 850,
 'glitches': 851,
 'disappearing': 149,
 'nolte': 853,
 'upsets': 854,
 'uruguay': 857,
 'rajasekhar': 856,
 'nicknamed': 61852,
 'golino': 858,
 'aggrandizement': 859,
 'vertical': 860,
 'frederik': 865,
 'numbered': 862,
 'transplant': 863,
 'contested': 864,
 'fetchit': 866,
 'drawers': 49421,
 'rotoscoping': 64875,
 'secretsdirector': 869,
 'trillions': 868,
 'pushups': 870,
 'latex': 871,
 'coal': 872,
 'prosthetic': 873,
 'tres': 37012,
 'tribulation': 24800,
 'tantrapur': 62151,
 'pettiness': 874,
 'cleanse': 875,
 'opine': 12527,
 'asininity': 877,
 'nullifying': 878,
 'siamese': 879,
 'tonnerre': 884,
 'aggresive': 882,
 'lbeck': 883,
 'hillary': 885,
 'chahracters': 886,
 'eartha': 888,
 'amilee': 890,
 'ethier': 61856,
 'rostova': 891,
 'culminated': 52331,
 'hyperbole': 892,
 'scottland': 61858,
 'debate': 893,
 'reload': 899,
 'uniqueness': 895,
 'minette': 900,
 'houseboat': 897,
 'developer': 898,
 'armando': 907,
 'overshadows': 901,
 'lothar': 904,
 'mixer': 905,
 'toilet': 906,
 'sweating': 12538,
 'warsaw': 31312,
 'winterwonder': 908,
 'remember': 909,
 'sued': 910,
 'prayer': 911,
 'fantasize': 72753,
 'aggravation': 912,
 'flamethrower': 913,
 'cellmates': 162,
 'cabinet': 914,
 'cadillacs': 915,
 'quits': 160,
 'meatball': 918,
 'weber': 919,
 'bills': 164,
 'maleficent': 920,
 'orked': 921,
 'uncannily': 922,
 'neuroinfectious': 923,
 'kabuto': 925,
 'bankable': 926,
 'unconditionally': 24809,
 'julietta': 927,
 'columbusland': 928,
 'stride': 929,
 'sights': 930,
 'viable': 931,
 'nu': 932,
 'scripture': 933,
 'moral': 934,
 'proved': 935,
 'openers': 37022,
 'esperando': 936,
 'darkish': 937,
 'childhoods': 12548,
 'seeds': 939,
 'sabc': 65318,
 'jerkoff': 12547,
 'crud': 941,
 'orbison': 942,
 'nicky': 943,
 'unpleasantness': 944,
 'horniphobia': 49434,
 'bargepoles': 946,
 'pearlie': 951,
 'bald': 948,
 'berth': 950,
 'utilises': 952,
 'wax': 61870,
 'prostitute': 954,
 'wildcat': 49436,
 'mailman': 956,
 'negative': 957,
 'voyeurs': 61874,
 'genii': 958,
 'korzeniowsky': 959,
 'whackees': 960,
 'longed': 962,
 'ceylon': 963,
 'sacrine': 964,
 'blackmailed': 966,
 'distribution': 967,
 'butthead': 968,
 'squandered': 969,
 'reprimanded': 970,
 'kafkanian': 979,
 'linebacker': 974,
 'joliet': 64621,
 'treadmill': 976,
 'guided': 977,
 'turiquistan': 24816,
 'hortense': 29368,
 'cassady': 981,
 'furlough': 61876,
 'withnail': 982,
 'lundegaard': 173,
 'burkhardt': 24818,
 'ubiquitous': 49442,
 'surveyed': 983,
 'kamp': 984,
 'fascinated': 985,
 'institutions': 986,
 'bf': 987,
 'afforded': 61880,
 'farhan': 988,
 'morgus': 989,
 'sw': 992,
 'chlorians': 993,
 'cockiness': 995,
 'praskins': 997,
 'giants': 998,
 'raging': 999,
 'jamestown': 1000,
 'intently': 37033,
 'roman': 1002,
 ...}

TODO: Complete the implementation of update_input_layer. It should count how many times each word is used in the given review, and then store those counts at the appropriate indices inside layer_0.

In [24]:
def update_input_layer(review):
    """ Modify the global layer_0 to represent the vector form of review.
    The element at a given index of layer_0 should represent
    how many times the given word occurs in the review.

    The review is split on single spaces, so consecutive spaces produce
    empty-string tokens; these are counted too when '' has an entry in
    word2index. Words that are not present in word2index are skipped
    rather than raising KeyError, so reviews containing unseen words can
    still be vectorised.
    Args:
        review(string) - the string of the review
    Returns:
        None (layer_0 is modified in place)
    """
    global layer_0

    # clear out previous state, reset the layer to be all 0s
    # (fill() also clears NaN/inf entries, which multiplying by 0 would keep)
    layer_0.fill(0)

    # count how many times each word is used in the given review and store the results in layer_0
    for word in review.split(" "):
        index = word2index.get(word)
        if index is not None:
            layer_0[0][index] += 1

Run the following cell to test updating the input layer with the first review. The indices assigned may not be the same as in the solution, but hopefully you'll see some non-zero values in layer_0.

In [25]:
update_input_layer(reviews[0])
layer_0
Out[25]:
array([[18.,  0.,  0., ...,  0.,  0.,  0.]])

TODO: Complete the implementation of get_target_for_label. It should return 0 or 1, depending on whether the given label is NEGATIVE or POSITIVE, respectively.

In [27]:
def get_target_for_label(label):
    """Map a sentiment label to the numeric target used for training.

    Args:
        label(string) - Either "POSITIVE" or "NEGATIVE".
    Returns:
        `1` when the label is exactly "POSITIVE", `0` for anything else.
    """
    return 1 if label == 'POSITIVE' else 0

Run the following two cells. They should print out 'POSITIVE' and 1, respectively.

In [28]:
labels[0]
Out[28]:
'POSITIVE'
In [29]:
get_target_for_label(labels[0])
Out[29]:
1

Run the following two cells. They should print out 'NEGATIVE' and 0, respectively.

In [30]:
labels[1]
Out[30]:
'NEGATIVE'
In [31]:
get_target_for_label(labels[1])
Out[31]:
0

Building a Neural Network

TODO: We've included the framework of a class called SentimentNetwork. Implement all of the items marked TODO in the code. These include doing the following:

  • Create a basic neural network much like the networks you've seen in earlier lessons and in Project 1, with an input layer, a hidden layer, and an output layer.
  • Do not add a non-linearity in the hidden layer. That is, do not use an activation function when calculating the hidden layer outputs.
  • Re-use the code from earlier in this notebook to create the training data (see TODOs in the code)
  • Implement the pre_process_data function to create the vocabulary for our training data generating functions
  • Ensure train trains over the entire corpus
In [32]:
import time
import sys
import numpy as np

# Encapsulate our neural network in a class
class SentimentNetwork:
    """Small feed-forward sentiment classifier fed with per-review word counts.

    The input layer has one node per vocabulary word, the hidden layer is
    linear (no activation function) and the single output node is a sigmoid.
    """

    def __init__(self, reviews, labels, hidden_nodes=10, learning_rate=0.1):
        """Create a SentimentNetwork with the given settings.

        Args:
            reviews(list) - List of reviews used for training
            labels(list) - List of POSITIVE/NEGATIVE labels associated with the given reviews
            hidden_nodes(int) - Number of nodes to create in the hidden layer
            learning_rate(float) - Learning rate to use while training
        """
        # Fixed seed: the random initial weights are the same on every run.
        np.random.seed(1)

        # The vocabulary must exist first, its size is the input layer width.
        self.pre_process_data(reviews, labels)

        # One input node per vocabulary word and a single output node.
        self.init_network(len(self.review_vocab), hidden_nodes, 1, learning_rate)

    def pre_process_data(self, reviews, labels):
        """Collect the word and label vocabularies and their index lookups.

        Sets review_vocab, label_vocab, review_vocab_size, label_vocab_size,
        word2index and label2index on the instance.
        """
        # Every distinct space-separated token across all reviews.
        seen_words = set()
        for review in reviews:
            seen_words.update(review.split(" "))
        # Lists, so that each entry has a stable integer position.
        self.review_vocab = list(seen_words)

        # Every distinct label.
        self.label_vocab = list(set(labels))

        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)

        # Reverse lookups: word -> position and label -> position.
        self.word2index = {word: i for i, word in enumerate(self.review_vocab)}
        self.label2index = {label: i for i, label in enumerate(self.label_vocab)}

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """Store the layer sizes and learning rate and allocate weights and input layer."""
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        self.learning_rate = learning_rate

        # Input -> hidden weights start at zero.
        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))

        # Hidden -> output weights are drawn from a zero-mean normal distribution.
        spread = self.output_nodes ** -0.5
        self.weights_1_2 = np.random.normal(
            0.0, spread, (self.hidden_nodes, self.output_nodes)
        )

        # The input layer, a two-dimensional matrix with shape 1 x input_nodes.
        self.layer_0 = np.zeros((1, input_nodes))

    def update_input_layer(self, review):
        """Fill self.layer_0 with the number of occurrences of each known word.

        Tokens that are not keys of word2index are skipped, so a review that
        contains words outside the vocabulary can still be encoded.
        """
        # Wipe the counts left over from the previous review (in place).
        self.layer_0 *= 0

        counts = self.layer_0[0]
        lookup = self.word2index
        for token in review.split(" "):
            position = lookup.get(token)
            if position is not None:
                counts[position] += 1

    def get_target_for_label(self, label):
        """Return 1 for the label 'POSITIVE' and 0 for any other label."""
        return 1 if label == 'POSITIVE' else 0

    def sigmoid(self, x):
        """Logistic function 1 / (1 + e^-x)."""
        denominator = 1 + np.exp(-x)
        return 1 / denominator

    def sigmoid_output_2_derivative(self, output):
        """Derivative of the sigmoid, expressed through the sigmoid's own output."""
        complement = 1 - output
        return output * complement

    def train(self, training_reviews, training_labels):
        """Run one pass over the data, updating the weights after every review.

        Progress, speed and running training accuracy are written to stdout.
        """
        # Reviews and labels are paired by position, so the counts must agree.
        assert(len(training_reviews) == len(training_labels))

        correct_so_far = 0
        total = float(len(training_reviews))
        start = time.time()

        for i, review in enumerate(training_reviews):
            label = training_labels[i]

            ### Forward pass ###
            self.update_input_layer(review)
            hidden = np.dot(self.layer_0, self.weights_0_1)                # 1 x hidden_nodes
            output = self.sigmoid(np.dot(hidden, self.weights_1_2))        # 1 x 1

            ### Backward pass ###
            # Error at the output, scaled by the slope of the sigmoid there.
            output_error = output - self.get_target_for_label(label)
            output_delta = output_error * self.sigmoid_output_2_derivative(output)

            # The hidden layer is linear, so its gradient is just the error
            # propagated back through the (not yet updated) output weights.
            hidden_delta = np.dot(output_delta, self.weights_1_2.T)

            # Gradient descent step on both weight matrices.
            self.weights_1_2 -= np.dot(hidden.T, output_delta) * self.learning_rate
            self.weights_0_1 -= np.dot(self.layer_0.T, hidden_delta) * self.learning_rate

            # The prediction counts as correct when it lands on the label's side of 0.5.
            if (output >= 0.5 and label == 'POSITIVE') or (output < 0.5 and label == 'NEGATIVE'):
                correct_so_far += 1

            # Debug output: progress, throughput and running accuracy.
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

            status = "".join([
                "\rProgress:", str(100 * i / total)[:4],
                "% Speed(reviews/sec):", str(reviews_per_second)[0:5],
                " #Correct:", str(correct_so_far),
                " #Trained:", str(i + 1),
                " Training Accuracy:", str(correct_so_far * 100 / float(i + 1))[:4],
                "%",
            ])
            sys.stdout.write(status)

            # Start a fresh line every 2500 reviews so earlier snapshots stay visible.
            if i % 2500 == 0:
                print("")

    def test(self, testing_reviews, testing_labels):
        """
        Predict a label for each of the testing_reviews with run and report,
        on stdout, how many predictions match the testing_labels.
        """
        correct = 0
        total = float(len(testing_reviews))
        start = time.time()

        for i, review in enumerate(testing_reviews):
            if self.run(review) == testing_labels[i]:
                correct += 1

            # Debug output: progress, throughput and running accuracy.
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

            status = "".join([
                "\rProgress:", str(100 * i / total)[:4],
                "% Speed(reviews/sec):", str(reviews_per_second)[0:5],
                " #Correct:", str(correct),
                " #Tested:", str(i + 1),
                " Testing Accuracy:", str(correct * 100 / float(i + 1))[:4],
                "%",
            ])
            sys.stdout.write(status)

    def run(self, review):
        """
        Returns a POSITIVE or NEGATIVE prediction for the given review.
        """
        # Same forward pass as in train; the review is lower-cased first.
        self.update_input_layer(review.lower())
        hidden = np.dot(self.layer_0, self.weights_0_1)
        output = self.sigmoid(np.dot(hidden, self.weights_1_2))

        # An output of 0.5 or more means POSITIVE, anything below means NEGATIVE.
        return "POSITIVE" if output[0] >= 0.5 else "NEGATIVE"
        

Run the following cell to create a SentimentNetwork that will train on all but the last 1000 reviews (we're saving those for testing). Here we use a learning rate of 0.1.

In [33]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.1)

Run the following cell to test the network's performance against the last 1000 reviews (the ones we held out from our training set).

We have not trained the model yet, so the results should be about 50% as it will just be guessing and there are only two possible values to choose from.

In [34]:
mlp.test(reviews[-1000:],labels[-1000:])
Progress:99.9% Speed(reviews/sec):1065. #Correct:500 #Tested:1000 Testing Accuracy:50.0%

Run the following cell to actually train the network. During training, it will display the model's accuracy repeatedly as it trains so you can see how well it's doing.

In [35]:
mlp.train(reviews[:-1000],labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):227.4 #Correct:1251 #Trained:2501 Training Accuracy:50.0%
Progress:20.8% Speed(reviews/sec):223.8 #Correct:2501 #Trained:5001 Training Accuracy:50.0%
Progress:31.2% Speed(reviews/sec):215.9 #Correct:3751 #Trained:7501 Training Accuracy:50.0%
Progress:41.6% Speed(reviews/sec):219.7 #Correct:5001 #Trained:10001 Training Accuracy:50.0%
Progress:52.0% Speed(reviews/sec):218.9 #Correct:6251 #Trained:12501 Training Accuracy:50.0%
Progress:62.5% Speed(reviews/sec):218.2 #Correct:7501 #Trained:15001 Training Accuracy:50.0%
Progress:72.9% Speed(reviews/sec):218.8 #Correct:8751 #Trained:17501 Training Accuracy:50.0%
Progress:83.3% Speed(reviews/sec):219.6 #Correct:10001 #Trained:20001 Training Accuracy:50.0%
Progress:93.7% Speed(reviews/sec):219.2 #Correct:11251 #Trained:22501 Training Accuracy:50.0%
Progress:99.9% Speed(reviews/sec):219.0 #Correct:12000 #Trained:24000 Training Accuracy:50.0%

That most likely didn't train very well. Part of the reason may be because the learning rate is too high. Run the following cell to recreate the network with a smaller learning rate, 0.01, and then train the new network.

In [36]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.01)
mlp.train(reviews[:-1000],labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):168.5 #Correct:1248 #Trained:2501 Training Accuracy:49.9%
Progress:20.8% Speed(reviews/sec):165.7 #Correct:2498 #Trained:5001 Training Accuracy:49.9%
Progress:31.2% Speed(reviews/sec):164.7 #Correct:3748 #Trained:7501 Training Accuracy:49.9%
Progress:41.6% Speed(reviews/sec):163.9 #Correct:4998 #Trained:10001 Training Accuracy:49.9%
Progress:52.0% Speed(reviews/sec):163.4 #Correct:6248 #Trained:12501 Training Accuracy:49.9%
Progress:62.5% Speed(reviews/sec):162.9 #Correct:7491 #Trained:15001 Training Accuracy:49.9%
Progress:72.9% Speed(reviews/sec):162.1 #Correct:8741 #Trained:17501 Training Accuracy:49.9%
Progress:83.3% Speed(reviews/sec):161.8 #Correct:9991 #Trained:20001 Training Accuracy:49.9%
Progress:93.7% Speed(reviews/sec):161.4 #Correct:11241 #Trained:22501 Training Accuracy:49.9%
Progress:99.9% Speed(reviews/sec):161.3 #Correct:11990 #Trained:24000 Training Accuracy:49.9%

That probably wasn't much different. Run the following cell to recreate the network one more time with an even smaller learning rate, 0.001, and then train the new network.

In [37]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.001)
mlp.train(reviews[:-1000],labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):170.6 #Correct:1256 #Trained:2501 Training Accuracy:50.2%
Progress:20.8% Speed(reviews/sec):165.1 #Correct:2639 #Trained:5001 Training Accuracy:52.7%
Progress:31.2% Speed(reviews/sec):164.5 #Correct:4110 #Trained:7501 Training Accuracy:54.7%
Progress:41.6% Speed(reviews/sec):162.3 #Correct:5674 #Trained:10001 Training Accuracy:56.7%
Progress:52.0% Speed(reviews/sec):161.7 #Correct:7251 #Trained:12501 Training Accuracy:58.0%
Progress:62.5% Speed(reviews/sec):160.9 #Correct:8872 #Trained:15001 Training Accuracy:59.1%
Progress:72.9% Speed(reviews/sec):160.3 #Correct:10509 #Trained:17501 Training Accuracy:60.0%
Progress:83.3% Speed(reviews/sec):160.2 #Correct:12218 #Trained:20001 Training Accuracy:61.0%
Progress:93.7% Speed(reviews/sec):160.1 #Correct:13868 #Trained:22501 Training Accuracy:61.6%
Progress:99.9% Speed(reviews/sec):160.2 #Correct:14942 #Trained:24000 Training Accuracy:62.2%

With a learning rate of 0.001, the network should finally have started to improve during training. It's still not very good, but it shows that this solution has potential. We will improve it in the next lesson.

Understanding Neural Noise

In [38]:
from IPython.display import Image
Image(filename='sentiment_network.png')
Out[38]:
In [39]:
def update_input_layer(review):
    """Store the word counts of review in the global layer_0.

    Each space-separated word of the review adds 1 to the entry of layer_0
    found at word2index[word]; all other entries are reset to 0.

    Args:
        review(string) - the text of the review
    Returns:
        None
    Raises:
        KeyError - if a word of the review is not a key of word2index
    """
    global layer_0

    # Reset in place so that nothing from the previous review survives.
    layer_0 *= 0

    # layer_0 has a single row; update it one token at a time.
    counts = layer_0[0]
    for token in review.split(" "):
        position = word2index[token]
        counts[position] += 1

update_input_layer(reviews[0])
In [40]:
layer_0
Out[40]:
array([[ 18.,   0.,   0., ...,   0.,   0.,   0.]])
In [41]:
review_counter = Counter()
In [42]:
# Tally every space-separated token of the first review. Splitting on a single
# space also yields empty-string tokens wherever the text has consecutive spaces,
# and those are counted like any other token.
for word in reviews[0].split(" "):
    review_counter[word] += 1
In [43]:
review_counter.most_common()
Out[43]:
[('.', 27),
 ('', 18),
 ('the', 9),
 ('to', 6),
 ('high', 5),
 ('i', 5),
 ('bromwell', 4),
 ('is', 4),
 ('a', 4),
 ('teachers', 4),
 ('that', 4),
 ('of', 4),
 ('it', 2),
 ('at', 2),
 ('as', 2),
 ('school', 2),
 ('my', 2),
 ('in', 2),
 ('me', 2),
 ('students', 2),
 ('their', 2),
 ('student', 2),
 ('cartoon', 1),
 ('comedy', 1),
 ('ran', 1),
 ('same', 1),
 ('time', 1),
 ('some', 1),
 ('other', 1),
 ('programs', 1),
 ('about', 1),
 ('life', 1),
 ('such', 1),
 ('years', 1),
 ('teaching', 1),
 ('profession', 1),
 ('lead', 1),
 ('believe', 1),
 ('s', 1),
 ('satire', 1),
 ('much', 1),
 ('closer', 1),
 ('reality', 1),
 ('than', 1),
 ('scramble', 1),
 ('survive', 1),
 ('financially', 1),
 ('insightful', 1),
 ('who', 1),
 ('can', 1),
 ('see', 1),
 ('right', 1),
 ('through', 1),
 ('pathetic', 1),
 ('pomp', 1),
 ('pettiness', 1),
 ('whole', 1),
 ('situation', 1),
 ('all', 1),
 ('remind', 1),
 ('schools', 1),
 ('knew', 1),
 ('and', 1),
 ('when', 1),
 ('saw', 1),
 ('episode', 1),
 ('which', 1),
 ('repeatedly', 1),
 ('tried', 1),
 ('burn', 1),
 ('down', 1),
 ('immediately', 1),
 ('recalled', 1),
 ('classic', 1),
 ('line', 1),
 ('inspector', 1),
 ('m', 1),
 ('here', 1),
 ('sack', 1),
 ('one', 1),
 ('your', 1),
 ('welcome', 1),
 ('expect', 1),
 ('many', 1),
 ('adults', 1),
 ('age', 1),
 ('think', 1),
 ('far', 1),
 ('fetched', 1),
 ('what', 1),
 ('pity', 1),
 ('isn', 1),
 ('t', 1)]

Reducing Noise in Our Input Data

In [44]:
import time
import sys
import numpy as np

# Encapsulate our neural network in a class
class SentimentNetwork:
    """Small feed-forward sentiment classifier fed with word-presence flags.

    The input layer has one node per vocabulary word, set to 1 when the word
    occurs in the review. The hidden layer is linear (no activation function)
    and the single output node is a sigmoid.
    """

    def __init__(self, reviews, labels, hidden_nodes=10, learning_rate=0.1):
        """Create a SentimentNetwork with the given settings.

        Args:
            reviews(list) - List of reviews used for training
            labels(list) - List of POSITIVE/NEGATIVE labels associated with the given reviews
            hidden_nodes(int) - Number of nodes to create in the hidden layer
            learning_rate(float) - Learning rate to use while training
        """
        # Fixed seed: the random initial weights are the same on every run.
        np.random.seed(1)

        # The vocabulary must exist first, its size is the input layer width.
        self.pre_process_data(reviews, labels)

        # One input node per vocabulary word and a single output node.
        self.init_network(len(self.review_vocab), hidden_nodes, 1, learning_rate)

    def pre_process_data(self, reviews, labels):
        """Collect the word and label vocabularies and their index lookups.

        Sets review_vocab, label_vocab, review_vocab_size, label_vocab_size,
        word2index and label2index on the instance.
        """
        # Every distinct space-separated token across all reviews.
        seen_words = set()
        for review in reviews:
            seen_words.update(review.split(" "))
        # Lists, so that each entry has a stable integer position.
        self.review_vocab = list(seen_words)

        # Every distinct label.
        self.label_vocab = list(set(labels))

        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)

        # Reverse lookups: word -> position and label -> position.
        self.word2index = {word: i for i, word in enumerate(self.review_vocab)}
        self.label2index = {label: i for i, label in enumerate(self.label_vocab)}

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """Store the layer sizes and learning rate and allocate weights and input layer."""
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        self.learning_rate = learning_rate

        # Input -> hidden weights start at zero.
        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))

        # Hidden -> output weights are drawn from a zero-mean normal distribution.
        spread = self.output_nodes ** -0.5
        self.weights_1_2 = np.random.normal(
            0.0, spread, (self.hidden_nodes, self.output_nodes)
        )

        # The input layer, a two-dimensional matrix with shape 1 x input_nodes.
        self.layer_0 = np.zeros((1, input_nodes))

    def update_input_layer(self, review):
        """Flag in self.layer_0 which known words occur in the review.

        Each known word sets its entry to 1 no matter how often it appears.
        Tokens that are not keys of word2index are skipped, so a review that
        contains words outside the vocabulary can still be encoded.
        """
        # Wipe the flags left over from the previous review (in place).
        self.layer_0 *= 0

        flags = self.layer_0[0]
        lookup = self.word2index
        for token in review.split(" "):
            position = lookup.get(token)
            if position is not None:
                # Presence only: assign 1 instead of accumulating a count.
                flags[position] = 1

    def get_target_for_label(self, label):
        """Return 1 for the label 'POSITIVE' and 0 for any other label."""
        return 1 if label == 'POSITIVE' else 0

    def sigmoid(self, x):
        """Logistic function 1 / (1 + e^-x)."""
        denominator = 1 + np.exp(-x)
        return 1 / denominator

    def sigmoid_output_2_derivative(self, output):
        """Derivative of the sigmoid, expressed through the sigmoid's own output."""
        complement = 1 - output
        return output * complement

    def train(self, training_reviews, training_labels):
        """Run one pass over the data, updating the weights after every review.

        Progress, speed and running training accuracy are written to stdout.
        """
        # Reviews and labels are paired by position, so the counts must agree.
        assert(len(training_reviews) == len(training_labels))

        correct_so_far = 0
        total = float(len(training_reviews))
        start = time.time()

        for i, review in enumerate(training_reviews):
            label = training_labels[i]

            ### Forward pass ###
            self.update_input_layer(review)
            hidden = np.dot(self.layer_0, self.weights_0_1)
            output = self.sigmoid(np.dot(hidden, self.weights_1_2))

            ### Backward pass ###
            # Error at the output, scaled by the slope of the sigmoid there.
            output_error = output - self.get_target_for_label(label)
            output_delta = output_error * self.sigmoid_output_2_derivative(output)

            # The hidden layer is linear, so its gradient is just the error
            # propagated back through the (not yet updated) output weights.
            hidden_delta = np.dot(output_delta, self.weights_1_2.T)

            # Gradient descent step on both weight matrices.
            self.weights_1_2 -= np.dot(hidden.T, output_delta) * self.learning_rate
            self.weights_0_1 -= np.dot(self.layer_0.T, hidden_delta) * self.learning_rate

            # The prediction counts as correct when it lands on the label's side of 0.5.
            if (output >= 0.5 and label == 'POSITIVE') or (output < 0.5 and label == 'NEGATIVE'):
                correct_so_far += 1

            # Debug output: progress, throughput and running accuracy.
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

            status = "".join([
                "\rProgress:", str(100 * i / total)[:4],
                "% Speed(reviews/sec):", str(reviews_per_second)[0:5],
                " #Correct:", str(correct_so_far),
                " #Trained:", str(i + 1),
                " Training Accuracy:", str(correct_so_far * 100 / float(i + 1))[:4],
                "%",
            ])
            sys.stdout.write(status)

            # Start a fresh line every 2500 reviews so earlier snapshots stay visible.
            if i % 2500 == 0:
                print("")

    def test(self, testing_reviews, testing_labels):
        """
        Predict a label for each of the testing_reviews with run and report,
        on stdout, how many predictions match the testing_labels.
        """
        correct = 0
        total = float(len(testing_reviews))
        start = time.time()

        for i, review in enumerate(testing_reviews):
            if self.run(review) == testing_labels[i]:
                correct += 1

            # Debug output: progress, throughput and running accuracy.
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

            status = "".join([
                "\rProgress:", str(100 * i / total)[:4],
                "% Speed(reviews/sec):", str(reviews_per_second)[0:5],
                " #Correct:", str(correct),
                " #Tested:", str(i + 1),
                " Testing Accuracy:", str(correct * 100 / float(i + 1))[:4],
                "%",
            ])
            sys.stdout.write(status)

    def run(self, review):
        """
        Returns a POSITIVE or NEGATIVE prediction for the given review.
        """
        # Same forward pass as in train; the review is lower-cased first.
        self.update_input_layer(review.lower())
        hidden = np.dot(self.layer_0, self.weights_0_1)
        output = self.sigmoid(np.dot(hidden, self.weights_1_2))

        # An output of 0.5 or more means POSITIVE, anything below means NEGATIVE.
        return "POSITIVE" if output[0] >= 0.5 else "NEGATIVE"
        

Run the following cell to recreate the network and train it. Notice we've gone back to the higher learning rate of 0.1.

In [45]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.1)
mlp.train(reviews[:-1000],labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):170.0 #Correct:1796 #Trained:2501 Training Accuracy:71.8%
Progress:20.8% Speed(reviews/sec):167.1 #Correct:3780 #Trained:5001 Training Accuracy:75.5%
Progress:31.2% Speed(reviews/sec):165.0 #Correct:5865 #Trained:7501 Training Accuracy:78.1%
Progress:41.6% Speed(reviews/sec):162.2 #Correct:8020 #Trained:10001 Training Accuracy:80.1%
Progress:52.0% Speed(reviews/sec):160.1 #Correct:10148 #Trained:12501 Training Accuracy:81.1%
Progress:62.5% Speed(reviews/sec):159.9 #Correct:12289 #Trained:15001 Training Accuracy:81.9%
Progress:72.9% Speed(reviews/sec):160.3 #Correct:14394 #Trained:17501 Training Accuracy:82.2%
Progress:83.3% Speed(reviews/sec):160.2 #Correct:16577 #Trained:20001 Training Accuracy:82.8%
Progress:93.7% Speed(reviews/sec):160.2 #Correct:18763 #Trained:22501 Training Accuracy:83.3%
Progress:99.9% Speed(reviews/sec):160.3 #Correct:20084 #Trained:24000 Training Accuracy:83.6%
In [46]:
mlp.test(reviews[-1000:],labels[-1000:])
Progress:99.9% Speed(reviews/sec):1280. #Correct:857 #Tested:1000 Testing Accuracy:85.7%

Analyzing Inefficiencies in our Network

In [47]:
Image(filename='sentiment_network_sparse.png')
Out[47]:
In [48]:
layer_0 = np.zeros(10)
In [49]:
layer_0
Out[49]:
array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])
In [50]:
layer_0[4] = 1
layer_0[9] = 1
In [51]:
layer_0
Out[51]:
array([ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.])
In [52]:
weights_0_1 = np.random.randn(10,5)
In [53]:
layer_0.dot(weights_0_1)
Out[53]:
array([-0.10503756,  0.44222989,  0.24392938, -0.55961832,  0.21389503])
In [54]:
indices = [4,9]
In [55]:
layer_1 = np.zeros(5)
In [56]:
# Add up only the weight rows selected by `indices`; the explicit `1 *`
# stands for the input value that row would be multiplied by in a dot product.
for index in indices:
    layer_1 += (1 * weights_0_1[index])
In [57]:
layer_1
Out[57]:
array([-0.10503756,  0.44222989,  0.24392938, -0.55961832,  0.21389503])
In [58]:
Image(filename='sentiment_network_sparse_2.png')
Out[58]:
In [59]:
layer_1 = np.zeros(5)
In [60]:
# Add up only the weight rows selected by `indices`, with no multiplication at all.
for index in indices:
    layer_1 += (weights_0_1[index])
In [61]:
layer_1
Out[61]:
array([-0.10503756,  0.44222989,  0.24392938, -0.55961832,  0.21389503])

Making our Network More Efficient

TODO: Make the SentimentNetwork class more efficient by eliminating unnecessary multiplications and additions that occur during forward and backward propagation. To do that, you can do the following:

  • Copy the SentimentNetwork class from the previous project into the following cell.
  • Remove the update_input_layer function - you will not need it in this version.
  • Modify init_network:
    • You no longer need a separate input layer, so remove any mention of self.layer_0
    • You will be dealing with the old hidden layer more directly, so create self.layer_1, a two-dimensional matrix with shape 1 x hidden_nodes, with all values initialized to zero
  • Modify train:
    • Change the name of the input parameter training_reviews to training_reviews_raw. This will help with the next step.
    • At the beginning of the function, you'll want to preprocess your reviews to convert them to a list of indices (from word2index) that are actually used in the review. This is equivalent to what you saw in the video when Andrew set specific indices to 1. Your code should create a local list variable named training_reviews that should contain a list for each review in training_reviews_raw. Those lists should contain the indices for words found in the review.
    • Remove call to update_input_layer
    • Use self's layer_1 instead of a local layer_1 object.
    • In the forward pass, replace the code that updates layer_1 with new logic that only adds the weights for the indices used in the review.
    • When updating weights_0_1, only update the individual weights that were used in the forward pass.
  • Modify run:
    • Remove call to update_input_layer
    • Use self's layer_1 instead of a local layer_1 object.
    • Much like you did in train, you will need to pre-process the review so you can work with word indices, then update layer_1 by adding weights for the indices used in the review.
In [62]:
import time
import sys
import numpy as np

# Encapsulate our neural network in a class
class SentimentNetwork:
    """Two-layer bag-of-words sentiment classifier that exploits input sparsity.

    The network has one input node per vocabulary word, a linear hidden layer
    and a single sigmoid output node. Because a review only activates the few
    input nodes whose words it contains, the hidden layer is computed by summing
    the rows of ``weights_0_1`` for those words instead of multiplying a full
    (mostly zero) input vector by the weight matrix.

    Words are lower-cased consistently when the vocabulary is built, when
    training, and when predicting, so a word learned during training is always
    found again at prediction time regardless of its capitalisation.
    """

    def __init__(self, reviews,labels,hidden_nodes = 10, learning_rate = 0.1):
        """Create a SentimentNetwork with the given settings
        Args:
            reviews(list) - List of reviews used for training
            labels(list) - List of POSITIVE/NEGATIVE labels associated with the given reviews
            hidden_nodes(int) - Number of nodes to create in the hidden layer
            learning_rate(float) - Learning rate to use while training
        
        """
        # Assign a seed to our random number generator to ensure we get
        # reproducible results during development. Note that this seeds
        # NumPy's global generator.
        np.random.seed(1)

        # process the reviews and their associated labels so that everything
        # is ready for training
        self.pre_process_data(reviews, labels)
        
        # Build the network to have the number of hidden nodes and the learning rate that
        # were passed into this initializer. Make the same number of input nodes as
        # there are vocabulary words and create a single output node.
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)

    @staticmethod
    def _tokenize(review):
        """Split a raw review into lower-cased words on single spaces."""
        return review.lower().split(" ")

    def _review_to_indices(self, review):
        """Return the unique vocabulary indices of the known words in `review`.

        Words that are not in the vocabulary are ignored; each index appears
        at most once no matter how often its word occurs in the review.
        """
        word2index = self.word2index
        return list({word2index[word]
                     for word in self._tokenize(review)
                     if word in word2index})

    def _forward(self, indices):
        """Run a forward pass for the given active input indices.

        Overwrites ``self.layer_1`` in place with the sum of the input-to-hidden
        weight rows selected by `indices` and returns the sigmoid output
        (an array of shape 1 x output_nodes).
        """
        # Reset the hidden layer in place, then add in only the weights
        # belonging to inputs that are actually "on" for this review.
        self.layer_1 *= 0
        for index in indices:
            self.layer_1 += self.weights_0_1[index]

        return self.sigmoid(self.layer_1.dot(self.weights_1_2))

    def pre_process_data(self, reviews, labels):
        """Build the word/label vocabularies and their index lookup tables.

        Sets review_vocab, label_vocab, review_vocab_size, label_vocab_size,
        word2index and label2index on the instance.
        """
        # populate review_vocab with all of the (lower-cased) words in the given reviews
        review_vocab = set()
        for review in reviews:
            review_vocab.update(self._tokenize(review))

        # Convert the vocabulary set to a list so we can access words via indices
        self.review_vocab = list(review_vocab)
        
        # populate label_vocab with all of the distinct labels, then convert it
        # to a list so we can access labels via indices
        self.label_vocab = list(set(labels))
        
        # Store the sizes of the review and label vocabularies.
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        
        # Create a dictionary of words in the vocabulary mapped to index positions
        self.word2index = {word: i for i, word in enumerate(self.review_vocab)}
        
        # Create a dictionary of labels mapped to index positions
        self.label2index = {label: i for i, label in enumerate(self.label_vocab)}

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """Store the layer sizes and learning rate and allocate the weights."""
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Store the learning rate
        self.learning_rate = learning_rate

        # Initialize weights

        # These are the weights between the input layer and the hidden layer.
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))

        # These are the weights between the hidden layer and the output layer.
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, 
                                                (self.hidden_nodes, self.output_nodes))
        
        ## New for Project 5: Removed self.layer_0; added self.layer_1
        # The hidden layer, a two-dimensional matrix with shape 1 x hidden_nodes.
        # It is reused (zeroed and refilled in place) on every forward pass.
        self.layer_1 = np.zeros((1,hidden_nodes))
    
    ## New for Project 5: Removed update_input_layer function
    
    def get_target_for_label(self,label):
        """Return 1 for the label 'POSITIVE' and 0 for anything else."""
        if(label == 'POSITIVE'):
            return 1
        else:
            return 0
        
    def sigmoid(self,x):
        """Logistic sigmoid of x."""
        return 1 / (1 + np.exp(-x))
    
    def sigmoid_output_2_derivative(self,output):
        """Derivative of the sigmoid, expressed in terms of the sigmoid's output."""
        return output * (1 - output)
    
    ## New for Project 5: changed name of first parameter from 'training_reviews' 
    #                     to 'training_reviews_raw'
    def train(self, training_reviews_raw, training_labels):
        """Train on each review once, updating the weights after every review.

        Args:
            training_reviews_raw(list) - Raw review strings
            training_labels(list) - POSITIVE/NEGATIVE label for each review

        Progress, speed and running training accuracy are written to stdout.
        Raises AssertionError if the number of reviews and labels differ.
        """

        ## New for Project 5: pre-process training reviews so we can deal 
        #                     directly with the indices of non-zero inputs
        training_reviews = [self._review_to_indices(review)
                            for review in training_reviews_raw]

        # make sure we have a matching number of reviews and labels
        assert(len(training_reviews) == len(training_labels))
        
        # Keep track of correct predictions to display accuracy during training 
        correct_so_far = 0

        # Remember when we started for printing time statistics
        start = time.time()
        
        # loop through all the given reviews and run a forward and backward pass,
        # updating weights for every item
        for i in range(len(training_reviews)):
            
            # Get the next review (as a list of word indices) and its correct label
            review = training_reviews[i]
            label = training_labels[i]
            
            ### Forward pass ###

            ## New for Project 5: only the weights for non-zero inputs are summed
            #                     into 'self.layer_1'; there is no 'layer_0'
            layer_2 = self._forward(review)
            
            ### Backward pass ###

            # Output error
            layer_2_error = layer_2 - self.get_target_for_label(label) # Output layer error is the difference between desired target and actual output.
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # errors propagated to the hidden layer
            layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error

            # Update the weights
            ## New for Project 5: changed to use 'self.layer_1' instead of local 'layer_1'
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate # update hidden-to-output weights with gradient descent step
            
            ## New for Project 5: Only update the weights that were used in the forward pass
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate # update input-to-hidden weights with gradient descent step

            # Keep track of correct predictions.
            if(layer_2 >= 0.5 and label == 'POSITIVE'):
                correct_so_far += 1
            elif(layer_2 < 0.5 and label == 'NEGATIVE'):
                correct_so_far += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the training process. 
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            # Start a new line every 2500 reviews so a history of progress lines is kept
            if(i % 2500 == 0):
                print("")
    
    def test(self, testing_reviews, testing_labels):
        """
        Attempts to predict the labels for the given testing_reviews,
        and uses the test_labels to calculate the accuracy of those predictions.
        """
        
        # keep track of how many correct predictions we make
        correct = 0

        # we'll time how many predictions per second we make
        start = time.time()

        # Loop through each of the given reviews and call run to predict
        # its label. 
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the prediction process. 

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
    
    def run(self, review):
        """
        Returns a POSITIVE or NEGATIVE prediction for the given review.
        """
        # Run a forward pass through the network, like in the "train" function.
        
        ## New for Project 5: Identify the indices used in the review and then add
        #                     just those weights to self.layer_1 (no layer_0 needed)
        layer_2 = self._forward(self._review_to_indices(review))
        
        # Return POSITIVE for values greater-than-or-equal-to 0.5 in the output layer;
        # return NEGATIVE for other values
        if(layer_2[0] >= 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"

Run the following cell to recreate the network and train it once again.

In [63]:
# Build the network from every review except the last 1,000, then train it on that same slice
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.1)
mlp.train(reviews[:-1000],labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):1166. #Correct:1694 #Trained:2501 Training Accuracy:67.7%
Progress:20.8% Speed(reviews/sec):1143. #Correct:3678 #Trained:5001 Training Accuracy:73.5%
Progress:31.2% Speed(reviews/sec):1142. #Correct:5752 #Trained:7501 Training Accuracy:76.6%
Progress:41.6% Speed(reviews/sec):1150. #Correct:7880 #Trained:10001 Training Accuracy:78.7%
Progress:52.0% Speed(reviews/sec):1145. #Correct:10014 #Trained:12501 Training Accuracy:80.1%
Progress:62.5% Speed(reviews/sec):1146. #Correct:12143 #Trained:15001 Training Accuracy:80.9%
Progress:72.9% Speed(reviews/sec):1145. #Correct:14269 #Trained:17501 Training Accuracy:81.5%
Progress:83.3% Speed(reviews/sec):1139. #Correct:16449 #Trained:20001 Training Accuracy:82.2%
Progress:93.7% Speed(reviews/sec):1132. #Correct:18629 #Trained:22501 Training Accuracy:82.7%
Progress:99.9% Speed(reviews/sec):1130. #Correct:19945 #Trained:24000 Training Accuracy:83.1%

That should have trained much faster than the earlier attempts (compare the reviews/sec figures in the progress output). Run the following cell to test your model with 1000 predictions.

In [64]:
# Measure accuracy on the last 1,000 reviews, which were excluded from the training slice
mlp.test(reviews[-1000:],labels[-1000:])
Progress:99.9% Speed(reviews/sec):1623. #Correct:846 #Tested:1000 Testing Accuracy:84.6%

Further Noise Reduction

In [65]:
# Display the diagram stored in sentiment_network_sparse_2.png
# NOTE(review): Image is presumably IPython.display.Image imported earlier in the notebook — confirm
Image(filename='sentiment_network_sparse_2.png')
Out[65]:
In [66]:
# words most frequently seen in a review with a "POSITIVE" label
# NOTE(review): most_common() is called without a limit, so every entry is listed, largest
# value first; pos_neg_ratios is presumably a collections.Counter built earlier — confirm
pos_neg_ratios.most_common()
Out[66]:
[('edie', 4.6913478822291435),
 ('paulie', 4.0775374439057197),
 ('felix', 3.1527360223636558),
 ('polanski', 2.8233610476132043),
 ('matthau', 2.8067217286092401),
 ('victoria', 2.6810215287142909),
 ('mildred', 2.6026896854443837),
 ('gandhi', 2.5389738710582761),
 ('flawless', 2.451005098112319),
 ('superbly', 2.2600254785752498),
 ('perfection', 2.1594842493533721),
 ('astaire', 2.1400661634962708),
 ('captures', 2.0386195471595809),
 ('voight', 2.0301704926730531),
 ('wonderfully', 2.0218960560332353),
 ('powell', 1.9783454248084671),
 ('brosnan', 1.9547990964725592),
 ('lily', 1.9203768470501485),
 ('bakshi', 1.9029851043382795),
 ('lincoln', 1.9014583864844796),
 ('refreshing', 1.8551812956655511),
 ('breathtaking', 1.8481124057791867),
 ('bourne', 1.8478489358790986),
 ('lemmon', 1.8458266904983307),
 ('delightful', 1.8002701588959635),
 ('flynn', 1.7996646487351682),
 ('andrews', 1.7764919970972666),
 ('homer', 1.7692866133759964),
 ('beautifully', 1.7626953362841438),
 ('soccer', 1.7578579175523736),
 ('elvira', 1.7397031072720019),
 ('underrated', 1.7197859696029656),
 ('gripping', 1.7165360479904674),
 ('superb', 1.7091514458966952),
 ('delight', 1.6714733033535532),
 ('welles', 1.6677068205580761),
 ('sadness', 1.663505133704376),
 ('sinatra', 1.6389967146756448),
 ('touching', 1.637217476541176),
 ('timeless', 1.62924053973028),
 ('macy', 1.6211339521972916),
 ('unforgettable', 1.6177367152487956),
 ('favorites', 1.6158688027643908),
 ('stewart', 1.6119987332957739),
 ('sullivan', 1.6094379124341003),
 ('extraordinary', 1.6094379124341003),
 ('hartley', 1.6094379124341003),
 ('brilliantly', 1.5950491749820008),
 ('friendship', 1.5677652160335325),
 ('wonderful', 1.5645425925262093),
 ('palma', 1.5553706911638245),
 ('magnificent', 1.54663701119507),
 ('finest', 1.5462590108125689),
 ('jackie', 1.5439233053234738),
 ('ritter', 1.5404450409471491),
 ('tremendous', 1.5184661342283736),
 ('freedom', 1.5091151908062312),
 ('fantastic', 1.5048433868558566),
 ('terrific', 1.5026699370083942),
 ('noir', 1.493925025312256),
 ('sidney', 1.493925025312256),
 ('outstanding', 1.4910053152089213),
 ('pleasantly', 1.4894785973551214),
 ('mann', 1.4894785973551214),
 ('nancy', 1.488077055429833),
 ('marie', 1.4825711915553104),
 ('marvelous', 1.4739999415389962),
 ('excellent', 1.4647538505723599),
 ('ruth', 1.4596256342054401),
 ('stanwyck', 1.4412101187160054),
 ('widmark', 1.4350845252893227),
 ('splendid', 1.4271163556401458),
 ('chan', 1.423108334242607),
 ('exceptional', 1.4201959127955721),
 ('tender', 1.410986973710262),
 ('gentle', 1.4078005663408544),
 ('poignant', 1.4022947024663317),
 ('gem', 1.3932148039644643),
 ('amazing', 1.3919815802404802),
 ('chilling', 1.3862943611198906),
 ('fisher', 1.3862943611198906),
 ('davies', 1.3862943611198906),
 ('captivating', 1.3862943611198906),
 ('darker', 1.3652409519220583),
 ('april', 1.3499267169490159),
 ('kelly', 1.3461743673304654),
 ('blake', 1.3418425985490567),
 ('overlooked', 1.329135947279942),
 ('ralph', 1.32818673031261),
 ('bette', 1.3156767939059373),
 ('hoffman', 1.3150668518315229),
 ('cole', 1.3121863889661687),
 ('shines', 1.3049487216659381),
 ('powerful', 1.2999662776313934),
 ('notch', 1.2950456896547455),
 ('remarkable', 1.2883688239495823),
 ('pitt', 1.286210902562908),
 ('winters', 1.2833463918674481),
 ('vivid', 1.2762934659055623),
 ('gritty', 1.2757524867200667),
 ('giallo', 1.2745029551317739),
 ('portrait', 1.2704625455947689),
 ('innocence', 1.2694300209805796),
 ('psychiatrist', 1.2685113254635072),
 ('favorite', 1.2668956297860055),
 ('ensemble', 1.2656663733312759),
 ('stunning', 1.2622417124499117),
 ('burns', 1.259880436264232),
 ('garbo', 1.258954938743289),
 ('barbara', 1.2580400255962119),
 ('philip', 1.2527629684953681),
 ('panic', 1.2527629684953681),
 ('holly', 1.2527629684953681),
 ('carol', 1.2481440226390734),
 ('perfect', 1.246742480713785),
 ('appreciated', 1.2462482874741743),
 ('favourite', 1.2411123512753928),
 ('journey', 1.2367626271489269),
 ('rural', 1.235471471385307),
 ('bond', 1.2321436812926323),
 ('builds', 1.2305398317106577),
 ('brilliant', 1.2287554137664785),
 ('brooklyn', 1.2286654169163074),
 ('von', 1.225175011976539),
 ('recommended', 1.2163953243244932),
 ('unfolds', 1.2163953243244932),
 ('daniel', 1.20215296760895),
 ('perfectly', 1.1971931173405572),
 ('crafted', 1.1962507582320256),
 ('prince', 1.1939224684724346),
 ('troubled', 1.192138346678933),
 ('consequences', 1.1865810616140668),
 ('haunting', 1.1814999484738773),
 ('cinderella', 1.180052620608284),
 ('alexander', 1.1759989522835299),
 ('emotions', 1.1753049094563641),
 ('boxing', 1.1735135968412274),
 ('subtle', 1.1734135017508081),
 ('curtis', 1.1649873576129823),
 ('rare', 1.1566438362402944),
 ('loved', 1.1563661500586044),
 ('daughters', 1.1526795099383853),
 ('courage', 1.1438688802562305),
 ('dentist', 1.1426722784621401),
 ('highly', 1.1420208631618658),
 ('nominated', 1.1409146683587992),
 ('tony', 1.1397491942285991),
 ('draws', 1.1325138403437911),
 ('everyday', 1.1306150197542835),
 ('contrast', 1.1284652518177909),
 ('cried', 1.1213405397456659),
 ('fabulous', 1.1210851445201684),
 ('ned', 1.120591195386885),
 ('fay', 1.120591195386885),
 ('emma', 1.1184149159642893),
 ('sensitive', 1.113318436057805),
 ('smooth', 1.1089750757036563),
 ('dramas', 1.1080910326226534),
 ('today', 1.1050431789984001),
 ('helps', 1.1023091505494358),
 ('inspiring', 1.0986122886681098),
 ('jimmy', 1.0937696641923216),
 ('awesome', 1.0931328229034842),
 ('unique', 1.0881409888008142),
 ('tragic', 1.0871835928444868),
 ('intense', 1.0870514662670339),
 ('stellar', 1.0857088838322018),
 ('rival', 1.0822184788924332),
 ('provides', 1.0797081340289569),
 ('depression', 1.0782034170369026),
 ('shy', 1.0775588794702773),
 ('carrie', 1.076139432816051),
 ('blend', 1.0753554265038423),
 ('hank', 1.0736109864626924),
 ('diana', 1.0726368022648489),
 ('adorable', 1.0726368022648489),
 ('unexpected', 1.0722255334949147),
 ('achievement', 1.0668635903535293),
 ('bettie', 1.0663514264498881),
 ('happiness', 1.0632729222228008),
 ('glorious', 1.0608719606852626),
 ('davis', 1.0541605260972757),
 ('terrifying', 1.0525211814678428),
 ('beauty', 1.050410186850232),
 ('ideal', 1.0479685558493548),
 ('fears', 1.0467872208035236),
 ('hong', 1.0438040521731147),
 ('seasons', 1.0433496099930604),
 ('fascinating', 1.0414538748281612),
 ('carries', 1.0345904299031787),
 ('satisfying', 1.0321225473992768),
 ('definite', 1.0319209141694374),
 ('touched', 1.0296194171811581),
 ('greatest', 1.0248947127715422),
 ('creates', 1.0241097613701886),
 ('aunt', 1.023388867430522),
 ('walter', 1.022328983918479),
 ('spectacular', 1.0198314108149955),
 ('portrayal', 1.0189810189761024),
 ('ann', 1.0127808528183286),
 ('enterprise', 1.0116009116784799),
 ('musicals', 1.0096648026516135),
 ('deeply', 1.0094845087721023),
 ('incredible', 1.0061677561461084),
 ('mature', 1.0060195018402847),
 ('triumph', 0.99682959435816731),
 ('margaret', 0.99682959435816731),
 ('navy', 0.99493385919326827),
 ('harry', 0.99176919305006062),
 ('lucas', 0.990398704027877),
 ('sweet', 0.98966110487955483),
 ('joey', 0.98794672078059009),
 ('oscar', 0.98721905111049713),
 ('balance', 0.98649499054740353),
 ('warm', 0.98485340331145166),
 ('ages', 0.98449898190068863),
 ('guilt', 0.98082925301172619),
 ('glover', 0.98082925301172619),
 ('carrey', 0.98082925301172619),
 ('learns', 0.97881108885548895),
 ('unusual', 0.97788374278196932),
 ('sons', 0.97777581552483595),
 ('complex', 0.97761897738147796),
 ('essence', 0.97753435711487369),
 ('brazil', 0.9769153536905899),
 ('widow', 0.97650959186720987),
 ('solid', 0.97537964824416146),
 ('beautiful', 0.97326301262841053),
 ('holmes', 0.97246100334120955),
 ('awe', 0.97186058302896583),
 ('vhs', 0.97116734209998934),
 ('eerie', 0.97116734209998934),
 ('lonely', 0.96873720724669754),
 ('grim', 0.96873720724669754),
 ('sport', 0.96825047080486615),
 ('debut', 0.96508089604358704),
 ('destiny', 0.96343751029985703),
 ('thrillers', 0.96281074750904794),
 ('tears', 0.95977584381389391),
 ('rose', 0.95664202739772253),
 ('feelings', 0.95551144502743635),
 ('ginger', 0.95551144502743635),
 ('winning', 0.95471810900804055),
 ('stanley', 0.95387344302319799),
 ('cox', 0.95343027882361187),
 ('paris', 0.95278479030472663),
 ('heart', 0.95238806924516806),
 ('hooked', 0.95155887071161305),
 ('comfortable', 0.94803943018873538),
 ('mgm', 0.94446160884085151),
 ('masterpiece', 0.94155039863339296),
 ('themes', 0.94118828349588235),
 ('danny', 0.93967118051821874),
 ('anime', 0.93378388932167222),
 ('perry', 0.93328830824272613),
 ('joy', 0.93301752567946861),
 ('lovable', 0.93081883243706487),
 ('mysteries', 0.92953595862417571),
 ('hal', 0.92953595862417571),
 ('louis', 0.92871325187271225),
 ('charming', 0.92520609553210742),
 ('urban', 0.92367083917177761),
 ('allows', 0.92183091224977043),
 ('impact', 0.91815814604895041),
 ('italy', 0.91629073187415511),
 ('gradually', 0.91629073187415511),
 ('lifestyle', 0.91629073187415511),
 ('spy', 0.91289514287301687),
 ('treat', 0.91193342650519937),
 ('subsequent', 0.91056005716517008),
 ('kennedy', 0.90981821736853763),
 ('loving', 0.90967549275543591),
 ('surprising', 0.90937028902958128),
 ('quiet', 0.90648673177753425),
 ('winter', 0.90624039602065365),
 ('reveals', 0.90490540964902977),
 ('raw', 0.90445627422715225),
 ('funniest', 0.90078654533818991),
 ('pleased', 0.89994159387262562),
 ('norman', 0.89994159387262562),
 ('thief', 0.89874642222324552),
 ('season', 0.89827222637147675),
 ('secrets', 0.89794159320595857),
 ('colorful', 0.89705936994626756),
 ('highest', 0.8967461358011849),
 ('compelling', 0.89462923509297576),
 ('danes', 0.89248008318043659),
 ('castle', 0.88967708335606499),
 ('kudos', 0.88889175768604067),
 ('great', 0.88810470901464589),
 ('baseball', 0.88730319500090271),
 ('subtitles', 0.88730319500090271),
 ('bleak', 0.88730319500090271),
 ('winner', 0.88643776872447388),
 ('tragedy', 0.88563699078315261),
 ('todd', 0.88551907320740142),
 ('nicely', 0.87924946019380601),
 ('arthur', 0.87546873735389985),
 ('essential', 0.87373111745535925),
 ('gorgeous', 0.8731725250935497),
 ('fonda', 0.87294029100054127),
 ('eastwood', 0.87139541196626402),
 ('focuses', 0.87082835779739776),
 ('enjoyed', 0.87070195951624607),
 ('natural', 0.86997924506912838),
 ('intensity', 0.86835126958503595),
 ('witty', 0.86824103423244681),
 ('rob', 0.8642954367557748),
 ('worlds', 0.86377269759070874),
 ('health', 0.86113891179907498),
 ('magical', 0.85953791528170564),
 ('deeper', 0.85802182375017932),
 ('lucy', 0.85618680780444956),
 ('moving', 0.85566611005772031),
 ('lovely', 0.85290640004681306),
 ('purple', 0.8513711857748395),
 ('memorable', 0.84801189112086062),
 ('sings', 0.84729786038720367),
 ('craig', 0.84342938360928321),
 ('modesty', 0.84342938360928321),
 ('relate', 0.84326559685926517),
 ('episodes', 0.84223712084137292),
 ('strong', 0.84167135777060931),
 ('smith', 0.83959811108590054),
 ('tear', 0.83704136022001441),
 ('apartment', 0.83333115290549531),
 ('princess', 0.83290912293510388),
 ('disagree', 0.83290912293510388),
 ('kung', 0.83173334384609199),
 ('adventure', 0.83150561393278388),
 ('columbo', 0.82667857318446791),
 ('jake', 0.82667857318446791),
 ('adds', 0.82485652591452319),
 ('hart', 0.82472353834866463),
 ('strength', 0.82417544296634937),
 ('realizes', 0.82360006895738058),
 ('dave', 0.8232003088081431),
 ('childhood', 0.82208086393583857),
 ('forbidden', 0.81989888619908913),
 ('tight', 0.81883539572344199),
 ('surreal', 0.8178506590609026),
 ('manager', 0.81770990320170756),
 ('dancer', 0.81574950265227764),
 ('studios', 0.81093021621632877),
 ('con', 0.81093021621632877),
 ('miike', 0.80821651034473263),
 ('realistic', 0.80807714723392232),
 ('explicit', 0.80792269515237358),
 ('kurt', 0.8060875917405409),
 ('traditional', 0.80535917116687328),
 ('deals', 0.80535917116687328),
 ('holds', 0.80493858654806194),
 ('carl', 0.80437281567016972),
 ('touches', 0.80396154690023547),
 ('gene', 0.80314807577427383),
 ('albert', 0.8027669055771679),
 ('abc', 0.80234647252493729),
 ('cry', 0.80011930011211307),
 ('sides', 0.7995275841185171),
 ('develops', 0.79850769621777162),
 ('eyre', 0.79850769621777162),
 ('dances', 0.79694397424158891),
 ('oscars', 0.79633141679517616),
 ('legendary', 0.79600456599965308),
 ('hearted', 0.79492987486988764),
 ('importance', 0.79492987486988764),
 ('portraying', 0.79356592830699269),
 ('impressed', 0.79258107754813223),
 ('waters', 0.79112758892014912),
 ('empire', 0.79078565012386137),
 ('edge', 0.789774016249017),
 ('jean', 0.78845736036427028),
 ('environment', 0.78845736036427028),
 ('sentimental', 0.7864791203521645),
 ('captured', 0.78623760362595729),
 ('styles', 0.78592891401091158),
 ('daring', 0.78592891401091158),
 ('frank', 0.78275933924963248),
 ('tense', 0.78275933924963248),
 ('backgrounds', 0.78275933924963248),
 ('matches', 0.78275933924963248),
 ('gothic', 0.78209466657644144),
 ('sharp', 0.7814397877056235),
 ('achieved', 0.78015855754957497),
 ('court', 0.77947526404844247),
 ('steals', 0.7789140023173704),
 ('rules', 0.77844476107184035),
 ('colors', 0.77684619943659217),
 ('reunion', 0.77318988823348167),
 ('covers', 0.77139937745969345),
 ('tale', 0.77010822169607374),
 ('rain', 0.7683706017975328),
 ('denzel', 0.76804848873306297),
 ('stays', 0.76787072675588186),
 ('blob', 0.76725515271366718),
 ('maria', 0.76214005204689672),
 ('conventional', 0.76214005204689672),
 ('fresh', 0.76158434211317383),
 ('midnight', 0.76096977689870637),
 ('landscape', 0.75852993982279704),
 ('animated', 0.75768570169751648),
 ('titanic', 0.75666058628227129),
 ('sunday', 0.75666058628227129),
 ('spring', 0.7537718023763802),
 ('cagney', 0.7537718023763802),
 ('enjoyable', 0.75246375771636476),
 ('immensely', 0.75198768058287868),
 ('sir', 0.7507762933965817),
 ('nevertheless', 0.75067102469813185),
 ('driven', 0.74994477895307854),
 ('performances', 0.74883252516063137),
 ('memories', 0.74721440183022114),
 ('nowadays', 0.74721440183022114),
 ('simple', 0.74641420974143258),
 ('golden', 0.74533293373051557),
 ('leslie', 0.74533293373051557),
 ('lovers', 0.74497224842453125),
 ('relationship', 0.74484232345601786),
 ('supporting', 0.74357803418683721),
 ('che', 0.74262723782331497),
 ('packed', 0.7410032017375805),
 ('trek', 0.74021469141793106),
 ('provoking', 0.73840377214806618),
 ('strikes', 0.73759894313077912),
 ('depiction', 0.73682224406260699),
 ('emotional', 0.73678211645681524),
 ('secretary', 0.7366322924996842),
 ('influenced', 0.73511137965897755),
 ('florida', 0.73511137965897755),
 ('germany', 0.73288750920945944),
 ('brings', 0.73142936713096229),
 ('lewis', 0.73129894652432159),
 ('elderly', 0.73088750854279239),
 ('owner', 0.72743625403857748),
 ('streets', 0.72666987259858895),
 ('henry', 0.72642196944481741),
 ('portrays', 0.72593700338293632),
 ('bears', 0.7252354951114458),
 ('china', 0.72489587887452556),
 ('anger', 0.72439972406404984),
 ('society', 0.72433010799663333),
 ('available', 0.72415741730250549),
 ('best', 0.72347034060446314),
 ('bugs', 0.72270598280148979),
 ('magic', 0.71878961117328299),
 ('delivers', 0.71846498854423513),
 ('verhoeven', 0.71846498854423513),
 ('jim', 0.71783979315031676),
 ('donald', 0.71667767797013937),
 ('endearing', 0.71465338578090898),
 ('relationships', 0.71393795022901896),
 ('greatly', 0.71256526641704687),
 ('charlie', 0.71024161391924534),
 ('brad', 0.71024161391924534),
 ('simon', 0.70967648251115578),
 ('effectively', 0.70914752190638641),
 ('march', 0.70774597998109789),
 ('atmosphere', 0.70744773070214162),
 ('influence', 0.70733181555190172),
 ('genius', 0.706392407309966),
 ('emotionally', 0.70556970055850243),
 ('ken', 0.70526854109229009),
 ('identity', 0.70484322032313651),
 ('sophisticated', 0.70470800296102132),
 ('dan', 0.70457587638356811),
 ('andrew', 0.70329955202396321),
 ('india', 0.70144598337464037),
 ('roy', 0.69970458110610434),
 ('surprisingly', 0.6995780708902356),
 ('sky', 0.69780919366575667),
 ('romantic', 0.69664981111114743),
 ('match', 0.69566924999265523),
 ('meets', 0.69314718055994529),
 ('cowboy', 0.69314718055994529),
 ('wave', 0.69314718055994529),
 ('bitter', 0.69314718055994529),
 ('patient', 0.69314718055994529),
 ('stylish', 0.69314718055994529),
 ('britain', 0.69314718055994529),
 ('affected', 0.69314718055994529),
 ('beatty', 0.69314718055994529),
 ('love', 0.69198533541937324),
 ('paul', 0.68980827929443067),
 ('andy', 0.68846333124751902),
 ('performance', 0.68797386327972465),
 ('patrick', 0.68645819240914863),
 ('unlike', 0.68546468438792907),
 ('brooks', 0.68433655087779044),
 ('refuses', 0.68348526964820844),
 ('award', 0.6824518914431974),
 ('complaint', 0.6824518914431974),
 ('ride', 0.68229716453587952),
 ('dawson', 0.68171848473632257),
 ('luke', 0.68158635815886937),
 ('wells', 0.68087708796813096),
 ('france', 0.6804081547825156),
 ('sports', 0.68007509899259255),
 ('handsome', 0.68007509899259255),
 ('directs', 0.67875844310784572),
 ('rebel', 0.67875844310784572),
 ('greater', 0.67605274720064523),
 ('dreams', 0.67599410133369586),
 ('effective', 0.67565402311242806),
 ('interpretation', 0.67479804189174875),
 ('works', 0.67445504754779284),
 ('brando', 0.67445504754779284),
 ('noble', 0.6737290947028437),
 ('paced', 0.67314651385327573),
 ('le', 0.67067432470788668),
 ('master', 0.67015766233524654),
 ('h', 0.6696166831497512),
 ('rings', 0.66904962898088483),
 ('easy', 0.66895995494594152),
 ('city', 0.66820823221269321),
 ('sunshine', 0.66782937257565544),
 ('succeeds', 0.66647893347778397),
 ('relations', 0.664159643686693),
 ('england', 0.66387679825983203),
 ('glimpse', 0.66329421741026418),
 ('aired', 0.66268797307523675),
 ('sees', 0.66263163663399482),
 ('both', 0.66248336767382998),
 ('definitely', 0.66199789483898808),
 ('imaginative', 0.66139848224536502),
 ('appreciate', 0.66083893732728749),
 ('tricks', 0.66071190480679143),
 ('striking', 0.66071190480679143),
 ('carefully', 0.65999497324304479),
 ('complicated', 0.65981076029235353),
 ('perspective', 0.65962448852130173),
 ('trilogy', 0.65877953705573755),
 ('future', 0.65834665141052828),
 ('lion', 0.65742909795786608),
 ('douglas', 0.65540685257709819),
 ('victor', 0.65540685257709819),
 ('inspired', 0.65459851044271034),
 ('marriage', 0.65392646740666405),
 ('demands', 0.65392646740666405),
 ('father', 0.65172321672194655),
 ('page', 0.65123628494430852),
 ('instant', 0.65058756614114943),
 ('era', 0.6495567444850836),
 ('ruthless', 0.64934455790155243),
 ('saga', 0.64934455790155243),
 ('joan', 0.64891392558311978),
 ('joseph', 0.64841128671855386),
 ('workers', 0.64829661439459352),
 ('fantasy', 0.64726757480925168),
 ('distant', 0.64551913157069074),
 ('accomplished', 0.64551913157069074),
 ('manhattan', 0.64435701639051324),
 ('personal', 0.64355023942057321),
 ('meeting', 0.64313675998528386),
 ('individual', 0.64313675998528386),
 ('pushing', 0.64313675998528386),
 ('pleasant', 0.64250344774119039),
 ('brave', 0.64185388617239469),
 ('william', 0.64083139119578469),
 ('hudson', 0.64077919504262937),
 ('friendly', 0.63949446706762514),
 ('eccentric', 0.63907995928966954),
 ('awards', 0.63875310849414646),
 ('jack', 0.63838309514997038),
 ('seeking', 0.63808740337691783),
 ('divorce', 0.63757732940513456),
 ('colonel', 0.63757732940513456),
 ('jane', 0.63443957973316734),
 ('keeping', 0.63414883979798953),
 ('gives', 0.63383568159497883),
 ('ted', 0.63342794585832296),
 ('animation', 0.63208692379869902),
 ('progress', 0.6317782341836532),
 ('larger', 0.63127177684185776),
 ('concert', 0.63127177684185776),
 ('nation', 0.6296337748376194),
 ('albeit', 0.62739580299716491),
 ('adapted', 0.62613647027698516),
 ('discovers', 0.62542900650499444),
 ('classic', 0.62504956428050518),
 ('segment', 0.62335141862440335),
 ('morgan', 0.62303761437291871),
 ('mouse', 0.62294292188669675),
 ('impressive', 0.62211140744319349),
 ('artist', 0.62168821657780038),
 ('ultimate', 0.62168821657780038),
 ('griffith', 0.62117368093485603),
 ('drew', 0.62082651898031915),
 ('emily', 0.62082651898031915),
 ('moved', 0.6197197120051281),
 ('families', 0.61903920840622351),
 ('profound', 0.61903920840622351),
 ('innocent', 0.61851219917136446),
 ('versions', 0.61730910416844087),
 ('eddie', 0.61691981517206107),
 ('criticism', 0.61651395453902935),
 ('nature', 0.61594514653194088),
 ('recognized', 0.61518563909023349),
 ('sexuality', 0.61467556511845012),
 ('contract', 0.61400986000122149),
 ('brian', 0.61344043794920278),
 ('remembered', 0.6131044728864089),
 ('determined', 0.6123858239154869),
 ('offers', 0.61207935747116349),
 ('pleasure', 0.61195702582993206),
 ('washington', 0.61180154110599294),
 ('images', 0.61159731359583758),
 ('games', 0.61067095873570676),
 ('academy', 0.60872983874736208),
 ('fashioned', 0.60798937221963845),
 ('melodrama', 0.60749173598145145),
 ('rough', 0.60613580357031549),
 ('charismatic', 0.60613580357031549),
 ('peoples', 0.60613580357031549),
 ('dealing', 0.60517840761398811),
 ('fine', 0.60496962268013299),
 ('tap', 0.60391604683200273),
 ('trio', 0.60157998703445481),
 ('russell', 0.60120968523425966),
 ('figures', 0.60077386042893011),
 ('ward', 0.60005675749393339),
 ('shine', 0.59911823091166894),
 ('brady', 0.59911823091166894),
 ('job', 0.59845562125168661),
 ('satisfied', 0.59652034487087369),
 ('river', 0.59637962862495086),
 ('brown', 0.595773016534769),
 ('believable', 0.59566072133302495),
 ('always', 0.59470710774669278),
 ('bound', 0.59470710774669278),
 ('hall', 0.5933967777928858),
 ('cook', 0.5916777203950857),
 ('claire', 0.59136448625000293),
 ('broadway', 0.59033768669372433),
 ('anna', 0.58778666490211906),
 ('peace', 0.58628403501758408),
 ('visually', 0.58539431926349916),
 ('morality', 0.58525821854876026),
 ('falk', 0.58525821854876026),
 ('growing', 0.58466653756587539),
 ('experiences', 0.58314628534561685),
 ('stood', 0.58314628534561685),
 ('touch', 0.58122926435596001),
 ('lives', 0.5810976767513224),
 ('kubrick', 0.58066919713325493),
 ('timing', 0.58047401805583243),
 ('expressions', 0.57981849525294216),
 ('struggles', 0.57981849525294216),
 ('authentic', 0.57848427223980559),
 ('helen', 0.57763429343810091),
 ('pre', 0.57700753064729182),
 ('quirky', 0.5753641449035618),
 ('young', 0.57531672344534313),
 ('inner', 0.57454143815209846),
 ('mexico', 0.57443087372056334),
 ('clint', 0.57380042292737909),
 ('sisters', 0.57286101468544337),
 ('realism', 0.57226528899949558),
 ('french', 0.5720692490067093),
 ('personalities', 0.5720692490067093),
 ('surprises', 0.57113222999698177),
 ('adventures', 0.57113222999698177),
 ('overcome', 0.5697681593994407),
 ('timothy', 0.56953322459276867),
 ('tales', 0.56909453188996639),
 ('war', 0.56843317302781682),
 ('civil', 0.5679840376059393),
 ('countries', 0.56737779327091187),
 ('streep', 0.56710645966458029),
 ('tradition', 0.56685345523565323),
 ('oliver', 0.56673325570428668),
 ('australia', 0.56580775818334383),
 ('understanding', 0.56531380905006046),
 ('players', 0.56509525370004821),
 ('knowing', 0.56489284503626647),
 ('rogers', 0.56421349718405212),
 ('suspenseful', 0.56368911332305849),
 ('variety', 0.56368911332305849),
 ('true', 0.56281525180810066),
 ('jr', 0.56220982311246936),
 ('psychological', 0.56108745854687891),
 ('sent', 0.55961578793542266),
 ('grand', 0.55961578793542266),
 ('branagh', 0.55961578793542266),
 ('reminiscent', 0.55961578793542266),
 ('performing', 0.55961578793542266),
 ('wealth', 0.55961578793542266),
 ('overwhelming', 0.55961578793542266),
 ('odds', 0.55961578793542266),
 ('brothers', 0.55891181043362848),
 ('howard', 0.55811089675600245),
 ('david', 0.55693122256475369),
 ('generation', 0.55628799784274796),
 ('grow', 0.55612538299565417),
 ('survival', 0.55594605904646033),
 ('mainstream', 0.55574731115750231),
 ('dick', 0.55431073570572953),
 ('charm', 0.55288175575407861),
 ('kirk', 0.55278982286502287),
 ('twists', 0.55244729845681018),
 ('gangster', 0.55206858230003986),
 ('jeff', 0.55179306225421365),
 ('family', 0.55116244510065526),
 ('tend', 0.55053307336110335),
 ('thanks', 0.55049088015842218),
 ('world', 0.54744234723432639),
 ('sutherland', 0.54743536937855164),
 ('life', 0.54695514434959924),
 ('disc', 0.54654370636806993),
 ('bug', 0.54654370636806993),
 ('tribute', 0.5455111817538808),
 ('europe', 0.54522705048332309),
 ('sacrifice', 0.54430155296238014),
 ('color', 0.54405127139431109),
 ('superior', 0.54333490233128523),
 ('york', 0.54318235866536513),
 ('pulls', 0.54266622962164945),
 ('jackson', 0.54232429082536171),
 ('hearts', 0.54232429082536171),
 ('enjoy', 0.54124285135906114),
 ('redemption', 0.54056759296472823),
 ('madness', 0.540384426007535),
 ('stands', 0.5389965007326869),
 ('trial', 0.5389965007326869),
 ('greek', 0.5389965007326869),
 ('hamilton', 0.5389965007326869),
 ('each', 0.5388212312554177),
 ('faithful', 0.53773307668591508),
 ('received', 0.5372768098531604),
 ('documentaries', 0.53714293208336406),
 ('jealous', 0.53714293208336406),
 ('different', 0.53709860682460819),
 ('describes', 0.53680111016925136),
 ('shorts', 0.53596159703753288),
 ('brilliance', 0.53551823635636209),
 ('mountains', 0.53492317534505118),
 ('share', 0.53408248593025787),
 ('dealt', 0.53408248593025787),
 ('providing', 0.53329847961804933),
 ('explore', 0.53329847961804933),
 ('series', 0.5325809226575603),
 ('fellow', 0.5323318289869543),
 ('loves', 0.53062825106217038),
 ('revolution', 0.53062825106217038),
 ('olivier', 0.53062825106217038),
 ('roman', 0.53062825106217038),
 ('century', 0.53002783074992665),
 ('musical', 0.52966871156747064),
 ('heroic', 0.52925932545482868),
 ('approach', 0.52806743020049673),
 ('ironically', 0.52806743020049673),
 ('temple', 0.52806743020049673),
 ('moves', 0.5279372642387119),
 ('gift', 0.52702030968597136),
 ('julie', 0.52609309589677911),
 ('tells', 0.52415107836314001),
 ('radio', 0.52394671172868779),
 ('uncle', 0.52354439617376536),
 ('union', 0.52324814376454787),
 ('deep', 0.52309571635780505),
 ('reminds', 0.52157841554225237),
 ('famous', 0.52118841080153722),
 ('jazz', 0.52053443789295151),
 ('dennis', 0.51987545928590861),
 ('epic', 0.51919387343650736),
 ('adult', 0.519167695083386),
 ('shows', 0.51915322220375304),
 ('performed', 0.5191244265806858),
 ('demons', 0.5191244265806858),
 ('discovered', 0.51879379341516751),
 ('eric', 0.51879379341516751),
 ('youth', 0.5185626062681431),
 ('human', 0.51851411224987087),
 ('tarzan', 0.51813827061227724),
 ('ourselves', 0.51794309153485463),
 ('wwii', 0.51758240622887042),
 ('passion', 0.5162164724008671),
 ('desire', 0.51607497965213445),
 ('pays', 0.51581316527702981),
 ('dirty', 0.51557622652458857),
 ('fox', 0.51557622652458857),
 ('sympathetic', 0.51546600332249293),
 ('symbolism', 0.51546600332249293),
 ('attitude', 0.51530993621331933),
 ('appearances', 0.51466440007315639),
 ('jeremy', 0.51466440007315639),
 ('fun', 0.51439068993048687),
 ('south', 0.51420972175023116),
 ('arrives', 0.51409894911095988),
 ('present', 0.51341965894303732),
 ('com', 0.51326167856387173),
 ('smile', 0.51265880484765169),
 ('alan', 0.51082562376599072),
 ('ring', 0.51082562376599072),
 ('visit', 0.51082562376599072),
 ('fits', 0.51082562376599072),
 ('provided', 0.51082562376599072),
 ('carter', 0.51082562376599072),
 ('aging', 0.51082562376599072),
 ('countryside', 0.51082562376599072),
 ('begins', 0.51015650363396647),
 ('success', 0.50900578704900468),
 ('japan', 0.50900578704900468),
 ('accurate', 0.50895471583017893),
 ('proud', 0.50800474742434931),
 ('daily', 0.5075946031845443),
 ('karloff', 0.50724780241810674),
 ('atmospheric', 0.50724780241810674),
 ('recently', 0.50714914903668207),
 ('fu', 0.50704490092608467),
 ('horrors', 0.50656122497953315),
 ('finding', 0.50637127341661037),
 ('lust', 0.5059356384717989),
 ('hitchcock', 0.50574947073413001),
 ('among', 0.50334004951332734),
 ('viewing', 0.50302139827440906),
 ('investigation', 0.50262885656181222),
 ('shining', 0.50262885656181222),
 ('duo', 0.5020919437972361),
 ('cameron', 0.5020919437972361),
 ('finds', 0.50128303100539795),
 ('contemporary', 0.50077528791248915),
 ('genuine', 0.50046283673044401),
 ('frightening', 0.49995595152908684),
 ('plays', 0.49975983848890226),
 ('age', 0.49941323171424595),
 ('position', 0.49899116611898781),
 ('continues', 0.49863035067217237),
 ('roles', 0.49839716550752178),
 ('james', 0.49837216269470402),
 ('individuals', 0.49824684155913052),
 ('brought', 0.49783842823917956),
 ('hilarious', 0.49714551986191058),
 ('brutal', 0.49681488669639234),
 ('appropriate', 0.49643688631389105),
 ('dance', 0.49581998314812048),
 ('league', 0.49578774640145024),
 ('helping', 0.49578774640145024),
 ('answers', 0.49578774640145024),
 ('stunts', 0.49561620510246196),
 ('traveling', 0.49532143723002542),
 ('thoroughly', 0.49414593456733524),
 ('depicted', 0.49317068852726992),
 ('combination', 0.49247648509779424),
 ('honor', 0.49247648509779424),
 ('differences', 0.49247648509779424),
 ('fully', 0.49213349075383811),
 ('tracy', 0.49159426183810306),
 ('battles', 0.49140753790888908),
 ('possibility', 0.49112055268665822),
 ('romance', 0.4901589869574316),
 ('initially', 0.49002249613622745),
 ('happy', 0.4898997500608791),
 ('crime', 0.48977221456815834),
 ('singing', 0.4893852925281213),
 ('especially', 0.48901267837860624),
 ('shakespeare', 0.48754793889664511),
 ('hugh', 0.48729512635579658),
 ('detail', 0.48609484250827351),
 ('julia', 0.48550781578170082),
 ('san', 0.48550781578170082),
 ('guide', 0.48550781578170082),
 ('desperation', 0.48550781578170082),
 ('companion', 0.48550781578170082),
 ('strongly', 0.48460242866688824),
 ('necessary', 0.48302334245403883),
 ('humanity', 0.48265474679929443),
 ('drama', 0.48221998493060503),
 ('nonetheless', 0.48183808689273838),
 ('intrigue', 0.48183808689273838),
 ('warming', 0.48183808689273838),
 ('cuba', 0.48183808689273838),
 ('planned', 0.47957308026188628),
 ('pictures', 0.47929937011921681),
 ('broadcast', 0.47849024312305422),
 ('nine', 0.47803580094299974),
 ('settings', 0.47743860773325364),
 ('history', 0.47732966933780852),
 ('ordinary', 0.47725880012690741),
 ('trade', 0.47692407209030935),
 ('official', 0.47608267532211779),
 ('primary', 0.47608267532211779),
 ('episode', 0.47529620261150429),
 ('role', 0.47520268270188676),
 ('spirit', 0.47477690799839323),
 ('grey', 0.47409361449726067),
 ('ways', 0.47323464982718205),
 ('cup', 0.47260441094579297),
 ('piano', 0.47260441094579297),
 ('familiar', 0.47241617565111949),
 ('sinister', 0.47198579044972683),
 ('reveal', 0.47171449364936496),
 ('max', 0.47150852042515579),
 ('dated', 0.47121648567094482),
 ('losing', 0.47000362924573563),
 ('discovery', 0.47000362924573563),
 ('vicious', 0.47000362924573563),
 ('genuinely', 0.46871413841586385),
 ('hatred', 0.46734051182625186),
 ('mistaken', 0.46702300110759781),
 ('dream', 0.46608972992459924),
 ('challenge', 0.46608972992459924),
 ('crisis', 0.46575733836428446),
 ('photographed', 0.46488852857896512),
 ('critics', 0.46430560813109778),
 ('bird', 0.46430560813109778),
 ('machines', 0.46430560813109778),
 ('born', 0.46411383518967209),
 ('detective', 0.4636633473511525),
 ('higher', 0.46328467899699055),
 ('remains', 0.46262352194811296),
 ('inevitable', 0.46262352194811296),
 ('soviet', 0.4618180446592961),
 ('ryan', 0.46134556650262099),
 ('african', 0.46112595521371813),
 ('smaller', 0.46081520319132935),
 ('techniques', 0.46052488529119184),
 ('information', 0.46034171833399862),
 ('deserved', 0.45999798712841444),
 ('lynch', 0.45953232937844013),
 ('spielberg', 0.45953232937844013),
 ('cynical', 0.45953232937844013),
 ('tour', 0.45953232937844013),
 ('francisco', 0.45953232937844013),
 ('struggle', 0.45911782160048453),
 ('language', 0.45902121257712653),
 ('visual', 0.45823514408822852),
 ('warner', 0.45724137763188427),
 ('social', 0.45720078250735313),
 ('reality', 0.45719346885019546),
 ('hidden', 0.45675840249571492),
 ('breaking', 0.45601738727099561),
 ('sometimes', 0.45563021171182794),
 ('modern', 0.45500247579345005),
 ('surfing', 0.45425527227759638),
 ('popular', 0.45410691533051023),
 ('surprised', 0.4534409399850382),
 ('follows', 0.45245361754408348),
 ('keeps', 0.45234869400701483),
 ('john', 0.4520909494482197),
 ('mixed', 0.45198512374305722),
 ('defeat', 0.45198512374305722),
 ('justice', 0.45142724367280018),
 ('treasure', 0.45083371313801535),
 ('presents', 0.44973793178615257),
 ('years', 0.44919197032104968),
 ('chief', 0.44895022004790319),
 ('shadows', 0.44802472252696035),
 ('closely', 0.44701411102103689),
 ('segments', 0.44701411102103689),
 ('lose', 0.44658335503763702),
 ('caine', 0.44628710262841953),
 ('caught', 0.44610275383999071),
 ('hamlet', 0.44558510189758965),
 ('chinese', 0.44507424620321018),
 ('welcome', 0.44438052435783792),
 ('birth', 0.44368632092836219),
 ('represents', 0.44320543609101143),
 ('puts', 0.44279106572085081),
 ('visuals', 0.44183275227903923),
 ('fame', 0.44183275227903923),
 ('closer', 0.44183275227903923),
 ('web', 0.44183275227903923),
 ('criminal', 0.4412745608048752),
 ('minor', 0.4409224199448939),
 ('jon', 0.44086703515908027),
 ('liked', 0.44074991514020723),
 ('restaurant', 0.44031183943833246),
 ('de', 0.43983275161237217),
 ('flaws', 0.43983275161237217),
 ('searching', 0.4393666597838457),
 ('rap', 0.43891304217570443),
 ('light', 0.43884433018199892),
 ('elizabeth', 0.43872232986464677),
 ('marry', 0.43861731542506488),
 ('learned', 0.43825493093115531),
 ('controversial', 0.43825493093115531),
 ('oz', 0.43825493093115531),
 ('slowly', 0.43785660389939979),
 ('comedic', 0.43721380642274466),
 ('wayne', 0.43721380642274466),
 ('thrilling', 0.43721380642274466),
 ('bridge', 0.43721380642274466),
 ('married', 0.43658501682196887),
 ('nazi', 0.4361020775700542),
 ('murder', 0.4353180712578455),
 ('physical', 0.4353180712578455),
 ('johnny', 0.43483971678806865),
 ('michelle', 0.43445264498141672),
 ('wallace', 0.43403848055222038),
 ('comedies', 0.43395706390247063),
 ('silent', 0.43395706390247063),
 ('played', 0.43387244114515305),
 ('international', 0.43363598507486073),
 ('vision', 0.43286408229627887),
 ('intelligent', 0.43196704885367099),
 ('shop', 0.43078291609245434),
 ('also', 0.43036720209769169),
 ('levels', 0.4302451371066513),
 ('miss', 0.43006426712153217),
 ('movement', 0.4295626596872249),
 ...]
In [67]:
# The 30 words most strongly associated with a "NEGATIVE" label:
# read the ranking backwards from its tail, so the lowest ratio comes first.
pos_neg_ratios.most_common()[:-31:-1]
Out[67]:
[('boll', -4.0778152602708904),
 ('uwe', -3.9218753018711578),
 ('seagal', -3.3202501058581921),
 ('unwatchable', -3.0269848170580955),
 ('stinker', -2.9876839403711624),
 ('mst', -2.7753833211707968),
 ('incoherent', -2.7641396677532537),
 ('unfunny', -2.5545257844967644),
 ('waste', -2.4907515123361046),
 ('blah', -2.4475792789485005),
 ('horrid', -2.3715779644809971),
 ('pointless', -2.3451073877136341),
 ('atrocious', -2.3187369339642556),
 ('redeeming', -2.2667790015910296),
 ('prom', -2.2601040980178784),
 ('drivel', -2.2476029585766928),
 ('lousy', -2.2118080125207054),
 ('worst', -2.1930856334332267),
 ('laughable', -2.172468615469592),
 ('awful', -2.1385076866397488),
 ('poorly', -2.1326133844207011),
 ('wasting', -2.1178155545614512),
 ('remotely', -2.111046881095167),
 ('existent', -2.0024805005437076),
 ('boredom', -1.9241486572738005),
 ('miserably', -1.9216610938019989),
 ('sucks', -1.9166645809588516),
 ('uninspired', -1.9131499212248517),
 ('lame', -1.9117232884159072),
 ('insult', -1.9085323769376259)]
In [68]:
# Bokeh plotting utilities used by the histogram cells that follow
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
# Direct Bokeh output to the notebook so that show() renders plots inline
output_notebook()
Loading BokehJS ...
In [69]:
# Histogram (normalised to a density) of the positive/negative ratio of every word.
# Only `density=True` is passed: the legacy `normed` keyword was removed from
# np.histogram and raises a TypeError on current NumPy releases.
hist, edges = np.histogram(list(pos_neg_ratios.values()), density=True, bins=100)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Word Positive/Negative Affinity Distribution")
# One rectangle per bin: edges[:-1] / edges[1:] are the left / right bin borders
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)
In [70]:
# For every occurrence count, tally how many distinct words appear exactly
# that many times in the corpus (count -> number of words with that count).
frequency_frequency = Counter(cnt for _, cnt in total_counts.most_common())
In [71]:
# Histogram (normalised to a density) of the values stored in frequency_frequency.
# Only `density=True` is passed: the legacy `normed` keyword was removed from
# np.histogram and raises a TypeError on current NumPy releases.
hist, edges = np.histogram(list(frequency_frequency.values()), density=True, bins=100)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="The frequency distribution of the words in our corpus")
# One rectangle per bin: edges[:-1] / edges[1:] are the left / right bin borders
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

Reducing Noise by Strategically Reducing the Vocabulary

TODO: Improve SentimentNetwork's performance by reducing more noise in the vocabulary. Specifically, do the following:

  • Copy the SentimentNetwork class from the previous project into the following cell.
  • Modify pre_process_data:
    • Add two additional parameters: min_count and polarity_cutoff
    • Calculate the positive-to-negative ratios of words used in the reviews. (You can use code you've written elsewhere in the notebook, but we are moving it into the class like we did with other helper code earlier.)
    • Andrew's solution only calculates a positive-to-negative ratio for words that occur at least 50 times. This keeps the network from attributing too much sentiment to rarer words. You can choose to add this to your solution if you would like.
    • Change the code so that words are only added to the vocabulary if they occur in the reviews more than min_count times.
    • Change the code so that words are only added to the vocabulary if the absolute value of their positive-to-negative ratio is at least polarity_cutoff.
  • Modify __init__:
    • Add the same two parameters (min_count and polarity_cutoff) and use them when you call pre_process_data

The following code is the same as the previous project, with project-specific changes marked with "New for Project 6"

In [72]:
import time
import sys
import numpy as np

# Encapsulate our neural network in a class
class SentimentNetwork:
    """Sentiment classifier built on a small neural network with a reduced vocabulary.

    The network has one input node per vocabulary word, a linear hidden layer
    (no activation function) and a single sigmoid output node. Reviews are
    treated as sets of words: each vocabulary word present in a review
    contributes its row of input-to-hidden weights exactly once.
    """

    ## New for Project 6: added min_count and polarity_cutoff parameters
    def __init__(self, reviews,labels,min_count = 10,polarity_cutoff = 0.1,hidden_nodes = 10, learning_rate = 0.1):
        """Create a SentimentNetwork with the given settings
        Args:
            reviews(list) - List of reviews used for training
            labels(list) - List of POSITIVE/NEGATIVE labels associated with the given reviews
            min_count(int) - Words should only be added to the vocabulary 
                             if they occur more than this many times
            polarity_cutoff(float) - The absolute value of a word's positive-to-negative
                                     ratio must be at least this big to be considered.
            hidden_nodes(int) - Number of nodes to create in the hidden layer
            learning_rate(float) - Learning rate to use while training
        
        """
        # Assign a seed to our random number generator to ensure we get
        # reproducible results during development
        np.random.seed(1)

        # Process the reviews and their associated labels so that everything
        # is ready for training
        ## New for Project 6: added min_count and polarity_cutoff arguments to pre_process_data call
        self.pre_process_data(reviews, labels, polarity_cutoff, min_count)

        # Build the network to have the number of hidden nodes and the learning rate that
        # were passed into this initializer. Make the same number of input nodes as
        # there are vocabulary words and create a single output node.
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)

    ## New for Project 6: added min_count and polarity_cutoff parameters
    def pre_process_data(self, reviews, labels, polarity_cutoff, min_count):
        """Build the review/label vocabularies and their word -> index lookups.

        Args:
            reviews(list) - Review strings; words are obtained with split(" ")
            labels(list) - Label for each review; 'POSITIVE' counts as positive,
                           anything else as negative
            polarity_cutoff(float) - Words that have a positive-to-negative ratio are
                                     kept only if its absolute value is at least this big
            min_count(int) - Words are kept only if they occur more than this many times

        Sets self.review_vocab, self.label_vocab, self.review_vocab_size,
        self.label_vocab_size, self.word2index and self.label2index.
        """
        # Imported here so the class does not depend on an import made in some
        # other notebook cell.
        from collections import Counter

        ## ----------------------------------------
        ## New for Project 6: Calculate positive-to-negative ratios for words before
        #                     building vocabulary
        #
        positive_counts = Counter()
        negative_counts = Counter()
        total_counts = Counter()

        for i, review in enumerate(reviews):
            label_counts = positive_counts if labels[i] == 'POSITIVE' else negative_counts
            for word in review.split(" "):
                label_counts[word] += 1
                total_counts[word] += 1

        # Only words seen at least 50 times get a ratio; rarer words would
        # receive unreliable, extreme values.
        pos_neg_ratios = {}
        for term, cnt in total_counts.items():
            if cnt >= 50:
                ratio = positive_counts[term] / float(negative_counts[term] + 1)
                # Move the ratio to a log scale so positive and negative words
                # end up with comparable magnitudes around zero. The 0.01
                # keeps the logarithm finite when the ratio is 0.
                if ratio > 1:
                    pos_neg_ratios[term] = np.log(ratio)
                else:
                    pos_neg_ratios[term] = -np.log(1 / (ratio + 0.01))
        #
        ## end New for Project 6
        ## ----------------------------------------

        # Populate review_vocab. total_counts already holds every distinct word,
        # so there is no need to tokenize all the reviews a second time.
        review_vocab = set()
        for word, cnt in total_counts.items():
            ## New for Project 6: only add words that occur more than min_count times
            #                     and for words with pos/neg ratios, only add words
            #                     that meet the polarity_cutoff
            if cnt > min_count:
                if word in pos_neg_ratios:
                    if abs(pos_neg_ratios[word]) >= polarity_cutoff:
                        review_vocab.add(word)
                else:
                    review_vocab.add(word)

        # Convert the vocabulary set to a list so we can access words via indices.
        # Sorting makes the word -> index assignment identical on every run
        # (plain set iteration order of strings varies with the hash seed).
        self.review_vocab = sorted(review_vocab)

        # Same for the labels: a sorted list of the distinct labels.
        self.label_vocab = sorted(set(labels))

        # Store the sizes of the review and label vocabularies.
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)

        # Create a dictionary of words in the vocabulary mapped to index positions
        self.word2index = {word: i for i, word in enumerate(self.review_vocab)}

        # Create a dictionary of labels mapped to index positions
        self.label2index = {label: i for i, label in enumerate(self.label_vocab)}

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """Store the layer sizes and learning rate and initialize the weights.

        Args:
            input_nodes(int) - Number of input nodes
            hidden_nodes(int) - Number of hidden nodes
            output_nodes(int) - Number of output nodes
            learning_rate(float) - Step size used for the weight updates in train
        """
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Store the learning rate
        self.learning_rate = learning_rate

        # Initialize weights

        # These are the weights between the input layer and the hidden layer.
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))

        # These are the weights between the hidden layer and the output layer.
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, 
                                                (self.hidden_nodes, self.output_nodes))

        ## New for Project 5: Removed self.layer_0; added self.layer_1
        # The hidden layer, a two-dimensional matrix with shape 1 x hidden_nodes
        self.layer_1 = np.zeros((1,hidden_nodes))

    ## New for Project 5: Removed update_input_layer function

    def get_target_for_label(self,label):
        """Return the training target: 1 for 'POSITIVE', 0 for any other label."""
        if(label == 'POSITIVE'):
            return 1
        else:
            return 0

    def sigmoid(self,x):
        """Logistic sigmoid of x."""
        return 1 / (1 + np.exp(-x))

    def sigmoid_output_2_derivative(self,output):
        """Derivative of the sigmoid, expressed in terms of the sigmoid's output."""
        return output * (1 - output)

    ## New for Project 5: changed name of first parameter from 'training_reviews'
    #                     to 'training_reviews_raw'
    def train(self, training_reviews_raw, training_labels):
        """Run one pass over the given reviews, updating the weights after each review.

        Progress, speed and running training accuracy are written to stdout.

        Args:
            training_reviews_raw(list) - Review strings; words are obtained with split(" ")
            training_labels(list) - Label for each review, in the same order
        """
        # Make sure we have a matching number of reviews and labels. Checked
        # first so a mismatch fails before any preprocessing work is done.
        assert(len(training_reviews_raw) == len(training_labels))

        ## New for Project 5: pre-process training reviews so we can deal 
        #                     directly with the indices of non-zero inputs
        training_reviews = list()
        for review in training_reviews_raw:
            indices = set()
            for word in review.split(" "):
                if word in self.word2index:
                    indices.add(self.word2index[word])
            training_reviews.append(list(indices))

        # Keep track of correct predictions to display accuracy during training 
        correct_so_far = 0

        # Remember when we started for printing time statistics
        start = time.time()

        # Loop through all the given reviews and run a forward and backward pass,
        # updating weights for every item
        for i in range(len(training_reviews)):

            # Get the next review (as vocabulary indices) and its correct label
            review = training_reviews[i]
            label = training_labels[i]

            ### Forward pass ###

            # Hidden layer
            ## New for Project 5: Add in only the weights for non-zero items
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]

            # Output layer
            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))

            ### Backward pass ###

            # Output error: difference between the actual output and the desired target
            layer_2_error = layer_2 - self.get_target_for_label(label)
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Backpropagated error: errors propagated to the hidden layer
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T)
            # Hidden layer gradients - no nonlinearity so it's the same as the error
            layer_1_delta = layer_1_error

            # Update the hidden-to-output weights with a gradient descent step
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate

            ## New for Project 5: Only update the weights that were used in the forward pass
            # The step is the same for every word in the review, so compute it once.
            input_step = layer_1_delta[0] * self.learning_rate
            for index in review:
                self.weights_0_1[index] -= input_step

            # Keep track of correct predictions.
            if(layer_2 >= 0.5 and label == 'POSITIVE'):
                correct_so_far += 1
            elif(layer_2 < 0.5 and label == 'NEGATIVE'):
                correct_so_far += 1

            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the training process. 
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            # Start a new line every 2500 reviews so earlier progress stays visible
            if(i % 2500 == 0):
                print("")

    def test(self, testing_reviews, testing_labels):
        """
        Attempts to predict the labels for the given testing_reviews,
        and uses the testing_labels to calculate the accuracy of those predictions.
        Progress, speed and running accuracy are written to stdout.
        """

        # keep track of how many correct predictions we make
        correct = 0

        # we'll time how many predictions per second we make
        start = time.time()

        # Loop through each of the given reviews and call run to predict
        # its label. 
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1

            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the prediction process. 
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")

    def run(self, review):
        """
        Returns a POSITIVE or NEGATIVE prediction for the given review.
        The review is lower-cased before its words are looked up; words that
        are not in the vocabulary are ignored.
        """
        # Run a forward pass through the network, like in the "train" function.

        # Hidden layer
        ## New for Project 5: Identify the indices used in the review and then add
        #                     just those weights to layer_1 
        self.layer_1 *= 0
        unique_indices = set()
        for word in review.lower().split(" "):
            if word in self.word2index:
                unique_indices.add(self.word2index[word])
        for index in unique_indices:
            self.layer_1 += self.weights_0_1[index]

        # Output layer
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))

        # Return POSITIVE for values greater than or equal to 0.5 in the output layer;
        # return NEGATIVE for other values
        if(layer_2[0] >= 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"

Run the following cell to train your network with a small polarity cutoff.

In [73]:
# Build the vocabulary from, and train on, everything except the last 1,000 reviews,
# which are held out for testing. polarity_cutoff=0.05 is the "small" cutoff.
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.05,learning_rate=0.01)
mlp.train(reviews[:-1000],labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):1307. #Correct:1994 #Trained:2501 Training Accuracy:79.7%
Progress:20.8% Speed(reviews/sec):1281. #Correct:4063 #Trained:5001 Training Accuracy:81.2%
Progress:31.2% Speed(reviews/sec):1284. #Correct:6176 #Trained:7501 Training Accuracy:82.3%
Progress:41.6% Speed(reviews/sec):1288. #Correct:8336 #Trained:10001 Training Accuracy:83.3%
Progress:52.0% Speed(reviews/sec):1286. #Correct:10501 #Trained:12501 Training Accuracy:84.0%
Progress:62.5% Speed(reviews/sec):1287. #Correct:12641 #Trained:15001 Training Accuracy:84.2%
Progress:72.9% Speed(reviews/sec):1283. #Correct:14782 #Trained:17501 Training Accuracy:84.4%
Progress:83.3% Speed(reviews/sec):1279. #Correct:16954 #Trained:20001 Training Accuracy:84.7%
Progress:93.7% Speed(reviews/sec):1276. #Correct:19143 #Trained:22501 Training Accuracy:85.0%
Progress:99.9% Speed(reviews/sec):1275. #Correct:20461 #Trained:24000 Training Accuracy:85.2%

And run the following cell to test its performance.

In [74]:
# Evaluate on the last 1,000 reviews, which were excluded from training
mlp.test(reviews[-1000:],labels[-1000:])
Progress:99.9% Speed(reviews/sec):1903. #Correct:859 #Tested:1000 Testing Accuracy:85.9%

Run the following cell to train your network with a much larger polarity cutoff.

In [75]:
# Same split as before (last 1,000 reviews held out), but with a much larger
# polarity_cutoff of 0.8, so fewer words make it into the vocabulary.
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.8,learning_rate=0.01)
mlp.train(reviews[:-1000],labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):6770. #Correct:2114 #Trained:2501 Training Accuracy:84.5%
Progress:20.8% Speed(reviews/sec):6416. #Correct:4235 #Trained:5001 Training Accuracy:84.6%
Progress:31.2% Speed(reviews/sec):6389. #Correct:6362 #Trained:7501 Training Accuracy:84.8%
Progress:41.6% Speed(reviews/sec):6406. #Correct:8513 #Trained:10001 Training Accuracy:85.1%
Progress:52.0% Speed(reviews/sec):6447. #Correct:10641 #Trained:12501 Training Accuracy:85.1%
Progress:62.5% Speed(reviews/sec):6367. #Correct:12796 #Trained:15001 Training Accuracy:85.3%
Progress:72.9% Speed(reviews/sec):6376. #Correct:14911 #Trained:17501 Training Accuracy:85.2%
Progress:83.3% Speed(reviews/sec):6405. #Correct:17077 #Trained:20001 Training Accuracy:85.3%
Progress:93.7% Speed(reviews/sec):6403. #Correct:19258 #Trained:22501 Training Accuracy:85.5%
Progress:99.9% Speed(reviews/sec):6424. #Correct:20552 #Trained:24000 Training Accuracy:85.6%

And run the following cell to test its performance.

In [76]:
# Evaluate on the last 1,000 reviews, which were excluded from training
mlp.test(reviews[-1000:],labels[-1000:])
Progress:99.9% Speed(reviews/sec):6031. #Correct:822 #Tested:1000 Testing Accuracy:82.2%

Analysis: What's Going on in the Weights?

In [77]:
# min_count=0 and polarity_cutoff=0 let every word pass both vocabulary filters,
# so this network is built on the full vocabulary of the training reviews.
mlp_full = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=0,polarity_cutoff=0,learning_rate=0.01)
In [78]:
# Train the full-vocabulary network on everything except the last 1,000 reviews
mlp_full.train(reviews[:-1000],labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):1063. #Correct:1962 #Trained:2501 Training Accuracy:78.4%
Progress:20.8% Speed(reviews/sec):1041. #Correct:4002 #Trained:5001 Training Accuracy:80.0%
Progress:31.2% Speed(reviews/sec):1042. #Correct:6120 #Trained:7501 Training Accuracy:81.5%
Progress:41.6% Speed(reviews/sec):1047. #Correct:8271 #Trained:10001 Training Accuracy:82.7%
Progress:52.0% Speed(reviews/sec):1047. #Correct:10431 #Trained:12501 Training Accuracy:83.4%
Progress:62.5% Speed(reviews/sec):1047. #Correct:12565 #Trained:15001 Training Accuracy:83.7%
Progress:72.9% Speed(reviews/sec):1043. #Correct:14670 #Trained:17501 Training Accuracy:83.8%
Progress:83.3% Speed(reviews/sec):1041. #Correct:16833 #Trained:20001 Training Accuracy:84.1%
Progress:93.7% Speed(reviews/sec):1039. #Correct:19015 #Trained:22501 Training Accuracy:84.5%
Progress:99.9% Speed(reviews/sec):1039. #Correct:20335 #Trained:24000 Training Accuracy:84.7%
In [79]:
Image(filename='sentiment_network_sparse.png')
Out[79]:
In [80]:
def get_most_similar_words(focus = "horrible", top_n=None, network=None):
    """Rank vocabulary words by the similarity of their weights to `focus`.

    The score of a word is the dot product between its row of
    ``network.weights_0_1`` and the row belonging to `focus`.

    Args:
        focus: Word to compare every other word against. It must be a key of
            ``network.word2index``.
        top_n: When given, only the `top_n` highest-scoring entries are
            returned. ``None`` (the default) returns the whole vocabulary,
            which is the original behaviour.
        network: Object exposing ``word2index`` (word -> row index) and
            ``weights_0_1`` (indexable by that row index). Defaults to the
            notebook-level ``mlp_full`` so existing calls keep working.

    Returns:
        A list of ``(word, score)`` tuples ordered from highest to lowest
        score, as produced by ``Counter.most_common``.

    Raises:
        KeyError: If `focus` is not in ``network.word2index``.
    """
    if network is None:
        # Fall back to the network trained earlier in the notebook.
        network = mlp_full

    word2index = network.word2index
    weights = network.weights_0_1

    # The focus row is the same for every comparison, so look it up once
    # instead of on each loop iteration.
    focus_vector = weights[word2index[focus]]

    most_similar = Counter()
    for word, index in word2index.items():
        most_similar[word] = np.dot(weights[index], focus_vector)

    return most_similar.most_common(top_n)
In [81]:
# Rank every vocabulary word by its weight similarity to "excellent".
# NOTE(review): the returned list covers the whole vocabulary, so the cell output
# is very long; consider slicing the result (e.g. [:20]) before displaying it.
get_most_similar_words("excellent")
Out[81]:
[('excellent', 0.1367295075735247),
 ('perfect', 0.12548286087225946),
 ('amazing', 0.091827633925999685),
 ('today', 0.090223662694414244),
 ('wonderful', 0.089355976962214576),
 ('fun', 0.08750446667420686),
 ('great', 0.087141758882292031),
 ('best', 0.085810885617880611),
 ('liked', 0.077697629123843412),
 ('definitely', 0.076628781406966037),
 ('brilliant', 0.073423858769279024),
 ('loved', 0.073285428928122162),
 ('favorite', 0.072781136036160765),
 ('superb', 0.071736207178505054),
 ('fantastic', 0.070922191916266197),
 ('job', 0.06916061720763407),
 ('incredible', 0.066424077952614388),
 ('enjoyable', 0.06563256050288882),
 ('rare', 0.064819212662615075),
 ('highly', 0.063889453350970515),
 ('enjoyed', 0.062127546101812953),
 ('wonderfully', 0.062055178604090148),
 ('perfectly', 0.06109320881188738),
 ('fascinating', 0.060663547937493865),
 ('bit', 0.059655427045653062),
 ('gem', 0.059510859296156786),
 ('outstanding', 0.058860808147083006),
 ('beautiful', 0.058613934703162091),
 ('surprised', 0.058273314482562968),
 ('worth', 0.057657484236471213),
 ('especially', 0.057422020781760785),
 ('refreshing', 0.057310532092265755),
 ('entertaining', 0.056612033835629204),
 ('hilarious', 0.056168541032286662),
 ('masterpiece', 0.054993988649431572),
 ('simple', 0.054484083134924088),
 ('subtle', 0.054368883033508633),
 ('funniest', 0.053457164871302684),
 ('solid', 0.052903564743620651),
 ('awesome', 0.052489194202770408),
 ('always', 0.052260328525345297),
 ('noir', 0.051530194726406894),
 ('guys', 0.051109413645642691),
 ('sweet', 0.05081893031752599),
 ('unique', 0.05067016226358919),
 ('very', 0.050132994948528513),
 ('heart', 0.049948058498243603),
 ('moving', 0.049424601164379113),
 ('atmosphere', 0.048842500895912855),
 ('strong', 0.048570880631759204),
 ('remember', 0.048479036942291262),
 ('believable', 0.048415384391603783),
 ('shows', 0.048336045608039585),
 ('love', 0.047310648160924659),
 ('beautifully', 0.047118717440814903),
 ('both', 0.046957278901480347),
 ('terrific', 0.046686597975756632),
 ('touching', 0.046589962377280962),
 ('fine', 0.046256431328855756),
 ('caught', 0.04616332622478235),
 ('recommended', 0.045876341160885278),
 ('jack', 0.045352909975188323),
 ('everyone', 0.0451452739645994),
 ('episodes', 0.045064457062621285),
 ('classic', 0.044985816637932746),
 ('will', 0.044966672557930465),
 ('appreciate', 0.044764139584570851),
 ('powerful', 0.044176442621852774),
 ('realistic', 0.043597482283464814),
 ('performances', 0.04302024908784173),
 ('human', 0.042657925475092548),
 ('expecting', 0.042588442995212208),
 ('each', 0.042163774519666943),
 ('delightful', 0.041815007170235501),
 ('cry', 0.041750968395934833),
 ('enjoy', 0.041660091797818093),
 ('you', 0.041465994778271079),
 ('surprisingly', 0.041393139256517386),
 ('think', 0.041103720571057073),
 ('performance', 0.040844259420896853),
 ('nice', 0.04001650666693176),
 ('paced', 0.039944488647599627),
 ('true', 0.039750592643370677),
 ('tight', 0.039425438825552647),
 ('similar', 0.039222380170683489),
 ('friendship', 0.039110112764204306),
 ('somewhat', 0.039069615731010261),
 ('beauty', 0.038130922554738794),
 ('short', 0.037981700131409203),
 ('life', 0.037716639265310249),
 ('stunning', 0.037507364832543771),
 ('still', 0.037479827910101508),
 ('normal', 0.037422144669435116),
 ('works', 0.037255830186344194),
 ('appreciated', 0.037156165138066251),
 ('mind', 0.037080739403157773),
 ('twists', 0.036932552473074115),
 ('knowing', 0.036786021801572075),
 ('captures', 0.036467506884494703),
 ('certain', 0.036348359494082834),
 ('later', 0.03621004278676522),
 ('finest', 0.036132101827862653),
 ('compelling', 0.036098464918935771),
 ('others', 0.036090120202196069),
 ('tragic', 0.036005003580472754),
 ('viewing', 0.035933572455522998),
 ('above', 0.035886717849742587),
 ('them', 0.035717513281555764),
 ('matter', 0.035602710619685625),
 ('future', 0.03532377798757342),
 ('good', 0.035250130839512755),
 ('hooked', 0.035154077227307991),
 ('world', 0.035098777806455039),
 ('unexpected', 0.035078442502957774),
 ('innocent', 0.034765360696729204),
 ('tears', 0.034338309927008849),
 ('certainly', 0.034301037742714126),
 ('available', 0.034268101109488004),
 ('unlike', 0.034253988843446576),
 ('season', 0.03403892242701162),
 ('vhs', 0.034011519281018122),
 ('superior', 0.03391762273249576),
 ('unusual', 0.033797799688239365),
 ('genre', 0.033766115408287257),
 ('criminal', 0.033744472720326837),
 ('makes', 0.033587001877476617),
 ('greatest', 0.033431852271975371),
 ('small', 0.033426529870538416),
 ('episode', 0.033336443796849899),
 ('deal', 0.03333610766528191),
 ('now', 0.033283339034235492),
 ('quiet', 0.033147935977529276),
 ('played', 0.033108782201536791),
 ('day', 0.033074949731286572),
 ('moved', 0.032873980754099891),
 ('underrated', 0.032738818192726331),
 ('society', 0.032613580418616249),
 ('focuses', 0.032607333858382825),
 ('intense', 0.032564318613854976),
 ('sharp', 0.032309211040923352),
 ('adds', 0.032236076588351807),
 ('check', 0.032030541149668787),
 ('take', 0.031717140193258615),
 ('deeply', 0.031693099458454575),
 ('games', 0.031663495285720163),
 ('pre', 0.031251131973427125),
 ('change', 0.031183353959862572),
 ('thanks', 0.031172398048464698),
 ('own', 0.031121337943347097),
 ('easy', 0.031088479340529641),
 ('pace', 0.030934361491678216),
 ('parts', 0.030850186028628292),
 ('truly', 0.030836637734471681),
 ('tony', 0.030739434811745025),
 ('inspired', 0.030725453849735001),
 ('thought', 0.030707437377997408),
 ('complex', 0.030464622676702028),
 ('worlds', 0.030391255174782045),
 ('language', 0.03026497620030957),
 ('soundtrack', 0.030210032139046033),
 ('steals', 0.030207167115964776),
 ('glad', 0.029812003262142277),
 ('ride', 0.029801794809751706),
 ('came', 0.029760628313031522),
 ('impact', 0.029695785634015856),
 ('personally', 0.029677477012254878),
 ('gritty', 0.029540021762614992),
 ('effective', 0.029512382123355364),
 ('wise', 0.029510408701830342),
 ('ultimate', 0.029442440672320928),
 ('ways', 0.02943934179284419),
 ('well', 0.029238386207701292),
 ('sent', 0.029147924396380084),
 ('after', 0.029037668915531275),
 ('tells', 0.029004383695691471),
 ('along', 0.028932972901634903),
 ('modern', 0.028910642159349326),
 ('family', 0.028897380662865537),
 ('pleasantly', 0.028754280601052389),
 ('edge', 0.028744687476241267),
 ('american', 0.028706398764554431),
 ('england', 0.028640930969798119),
 ('grand', 0.028581102406371937),
 ('slowly', 0.02847032891292298),
 ('treat', 0.028418097520915953),
 ('pleasure', 0.028370704112004173),
 ('living', 0.028335845213660421),
 ('impressed', 0.028311856507726538),
 ('fans', 0.028234674336798979),
 ('suspenseful', 0.028156658725541156),
 ('smile', 0.028065651834597621),
 ('jim', 0.027910842672277558),
 ('saw', 0.027900239466183013),
 ('length', 0.027896431301274525),
 ('impressive', 0.02789477824336279),
 ('times', 0.027869981332762576),
 ('witty', 0.027809121334036409),
 ('flawless', 0.02767640930293911),
 ('magic', 0.027671001404746008),
 ('though', 0.027434087841071542),
 ('subtitles', 0.027431981179380463),
 ('stands', 0.027348518548416436),
 ('freedom', 0.027271908118037393),
 ('relationship', 0.027231146375769122),
 ('tape', 0.027213179198573845),
 ('apartment', 0.027198859160910003),
 ('shown', 0.027062169058709867),
 ('films', 0.027035590529373463),
 ('lot', 0.026934527370476347),
 ('barbara', 0.026837141036193592),
 ('office', 0.026775230449656295),
 ('damn', 0.026751196837598839),
 ('murder', 0.026709073212876629),
 ('brilliantly', 0.026701889741880664),
 ('learns', 0.026699872569574585),
 ('tends', 0.02668377436133575),
 ('complaint', 0.026587011626106865),
 ('themselves', 0.026524658938498962),
 ('war', 0.026518675436425335),
 ('violence', 0.02645062815807615),
 ('judge', 0.026443267774947349),
 ('thriller', 0.026431555027632107),
 ('his', 0.026370773394088588),
 ('finding', 0.026362279892885008),
 ('cast', 0.026360860883736632),
 ('police', 0.02635212945330526),
 ('once', 0.026255817642908227),
 ('spectacular', 0.026245466997092369),
 ('deserves', 0.026214508159961698),
 ('driven', 0.026194930792511634),
 ('spot', 0.026171686780563679),
 ('carrey', 0.026162838804053023),
 ('negative', 0.026161677045062212),
 ('suspense', 0.026110016575822799),
 ('flaws', 0.026085421601700295),
 ('brave', 0.026080835779725281),
 ('surprising', 0.026070851171974718),
 ('gives', 0.026069978044960768),
 ('takes', 0.026047493401813337),
 ('light', 0.025921067904644504),
 ('timing', 0.025900303450693638),
 ('crime', 0.025886011572638656),
 ('thank', 0.025873161609513355),
 ('century', 0.025871056310112637),
 ('until', 0.025870245942132521),
 ('nature', 0.02581794293587544),
 ('stellar', 0.025803971141651161),
 ('emotions', 0.025783809728671926),
 ('tremendous', 0.025772614605786566),
 ('missed', 0.025657501028952572),
 ('overall', 0.02565565248510179),
 ('haven', 0.025650692177140794),
 ('portrayal', 0.025594273657909641),
 ('taylor', 0.025516992710898172),
 ('appropriate', 0.025495908849901643),
 ('joan', 0.025489829859140632),
 ('realize', 0.025452457061382175),
 ('different', 0.025434073970060436),
 ('return', 0.025384569542597574),
 ('bound', 0.025380084410398834),
 ('noticed', 0.025306494998440777),
 ('constantly', 0.025282186745762471),
 ('first', 0.02524610088891982),
 ('lovable', 0.025213500492273062),
 ('comic', 0.025074597800944062),
 ('scared', 0.024995376513809515),
 ('fight', 0.0249432099458364),
 ('extraordinary', 0.024940366453083618),
 ('buy', 0.024803940824255574),
 ('know', 0.024749519416087051),
 ('brothers', 0.024675058346350739),
 ('action', 0.024660907824635248),
 ('needs', 0.024634851651549348),
 ('jerry', 0.02462148438534386),
 ('while', 0.024620233313683841),
 ('also', 0.02451948098747242),
 ('definite', 0.024509585305468838),
 ('genius', 0.024500478757646958),
 ('tragedy', 0.024481339186882271),
 ('heard', 0.024446567944460481),
 ('haunting', 0.024431007352898923),
 ('legendary', 0.024412777264908966),
 ('uses', 0.024358972452014009),
 ('years', 0.024316094895735264),
 ('notch', 0.024310571597216273),
 ('fabulous', 0.024258810824927632),
 ('herself', 0.024241390957491043),
 ('battle', 0.024205827940178126),
 ('ralph', 0.024205046194653305),
 ('provoking', 0.02410610606248181),
 ('ago', 0.024024541904156496),
 ('game', 0.024004541901512366),
 ('deals', 0.023947020249030993),
 ('themes', 0.023936597120221125),
 ('my', 0.023928374753346023),
 ('which', 0.023908264765228733),
 ('together', 0.023887683942808231),
 ('record', 0.023879473557965516),
 ('chilling', 0.023877413677317428),
 ('absorbing', 0.023848541510400115),
 ('studios', 0.023840610970325322),
 ('helps', 0.023800338082370951),
 ('paul', 0.023782537407117964),
 ('drama', 0.023766688862014728),
 ('spots', 0.023727534480488404),
 ('japanese', 0.023708475430511473),
 ('com', 0.023663537310393366),
 ('meets', 0.023649415936523161),
 ('may', 0.023577512715288875),
 ('goal', 0.023571992449256601),
 ('out', 0.02355875377346511),
 ('page', 0.023530160671184866),
 ('con', 0.023523200814540519),
 ('thankfully', 0.023405004970711702),
 ('number', 0.023389568775323534),
 ('captured', 0.023351056068531214),
 ('joy', 0.023338854638575431),
 ('brought', 0.023336907813285939),
 ('max', 0.023250909447975868),
 ('superbly', 0.023239871167515601),
 ('those', 0.023176845007530648),
 ('course', 0.02317012830505652),
 ('inspiring', 0.02312494046982002),
 ('troubled', 0.02310455328814329),
 ('starring', 0.023098181939380305),
 ('famous', 0.023080990484234922),
 ('nowadays', 0.023041214534459811),
 ('gripping', 0.023039160339941949),
 ('identity', 0.023038352369265179),
 ('many', 0.023030059748964167),
 ('victor', 0.023028627724258646),
 ('michael', 0.022946522358330855),
 ('stop', 0.022927047859442076),
 ('eerie', 0.022877301562370823),
 ('seen', 0.022820929217422643),
 ('caused', 0.022791670672167523),
 ('moment', 0.022789062338184285),
 ('portraying', 0.022729334983088958),
 ('influence', 0.022698569029077079),
 ('when', 0.022541791159242774),
 ('touched', 0.022525639292270219),
 ('complicated', 0.022432126566344645),
 ('turns', 0.022415566693423844),
 ('young', 0.022415228068632002),
 ('award', 0.022414761392271613),
 ('put', 0.022325849008177179),
 ('trust', 0.022301497663936395),
 ('issues', 0.022257753376187493),
 ('innocence', 0.022236928993752816),
 ('anime', 0.022201683728338903),
 ('without', 0.022144543987858849),
 ('himself', 0.022068240705874407),
 ('charlie', 0.022052037301460173),
 ('parents', 0.021888138202371753),
 ('covered', 0.02188753333796175),
 ('final', 0.021877215769079514),
 ('killers', 0.021830664900395112),
 ('ages', 0.021774376677575587),
 ('usual', 0.021760980512718141),
 ('physical', 0.021749103191221815),
 ('like', 0.021730991541426756),
 ('crazy', 0.021727382570242999),
 ('puts', 0.021725737321791526),
 ('got', 0.021701574500289107),
 ('room', 0.021690968569465618),
 ('complaints', 0.021670426593916561),
 ('type', 0.02166362898294516),
 ('brings', 0.021600600975875413),
 ('remarkable', 0.021576791719396034),
 ('get', 0.021538325389801358),
 ('city', 0.021523385378314889),
 ('coming', 0.021492351614142785),
 ('traditional', 0.021430875828269805),
 ('romantic', 0.021420587536168555),
 ('cinema', 0.021411776829230973),
 ('regular', 0.021395882255575847),
 ('intelligent', 0.021391350897315441),
 ('music', 0.021381013806527439),
 ('humor', 0.021365697759571502),
 ('experience', 0.021314525649372928),
 ('favourite', 0.021253476483878254),
 ('social', 0.021250085255237368),
 ('feelings', 0.021245030895714362),
 ('cried', 0.02123327164107075),
 ('rock', 0.021213280029832367),
 ('against', 0.021157314119587267),
 ('including', 0.021156674122491392),
 ('honest', 0.021143458758793494),
 ('parallel', 0.021107353247706462),
 ('eddie', 0.021080182147252727),
 ('crafted', 0.020979194953745083),
 ('more', 0.020933797343193814),
 ('glued', 0.02093198872193016),
 ('insanity', 0.020914935599101157),
 ('thoroughly', 0.020905661542252783),
 ('eyes', 0.020868013291281087),
 ('jr', 0.020865268971014529),
 ('dramas', 0.020836398428109228),
 ('follows', 0.020814937146708415),
 ('situation', 0.020814821105666476),
 ('understood', 0.020749677092470178),
 ('face', 0.020701739464945058),
 ('albeit', 0.020680340389878413),
 ('memorable', 0.020608260124115516),
 ('accurate', 0.020585303033408751),
 ('under', 0.020574430698374249),
 ('arthur', 0.020562083939889477),
 ('elderly', 0.020545350471808114),
 ('opinion', 0.020539570922797755),
 ('whoopi', 0.020515675744150072),
 ('helped', 0.020476242337130524),
 ('detract', 0.020443807698341681),
 ('flawed', 0.020436371691432333),
 ('unusually', 0.02043352383590533),
 ('performing', 0.020396957567555725),
 ('smooth', 0.020347681451465375),
 ('magnificent', 0.020334637688102845),
 ('desperation', 0.020287768999057237),
 ('lose', 0.020277535683257859),
 ('satisfying', 0.020251527110272057),
 ('friend', 0.020227651020398935),
 ('kudos', 0.02020147732692662),
 ('breaking', 0.020117861519854296),
 ('elephant', 0.020115783447057049),
 ('colors', 0.020112155987764876),
 ('willing', 0.020087728040224333),
 ('fresh', 0.02005401912359376),
 ('offers', 0.020003415308141068),
 ('provides', 0.020002909565985022),
 ('guilt', 0.019987917970659564),
 ('shouldn', 0.019907879458024368),
 ('japan', 0.019906368589571687),
 ('secrets', 0.019876976104814401),
 ('obligatory', 0.019789665431840416),
 ('dvd', 0.019782796187823446),
 ('tale', 0.019752149872839877),
 ('since', 0.019726258912690295),
 ('roles', 0.019710495505207995),
 ('breathtaking', 0.019705824135660535),
 ('ground', 0.019687236524961883),
 ('higher', 0.019670526139537559),
 ('jean', 0.019665400087401592),
 ('rich', 0.019653095716660723),
 ('right', 0.01962929358043573),
 ('stone', 0.0196105959056691),
 ('lives', 0.019610348936710136),
 ('it', 0.019542002303277566),
 ('essential', 0.019533860093920413),
 ('tend', 0.019523404457496816),
 ('places', 0.019510216587218025),
 ('recommend', 0.019506211559818118),
 ('loy', 0.019481148560970923),
 ('tell', 0.019450286669268763),
 ('challenge', 0.019374490591710931),
 ('fiction', 0.019350601498735374),
 ('able', 0.019340445094151441),
 ('animated', 0.019333069625267079),
 ('complain', 0.019332028796550105),
 ('deeper', 0.019318681931941171),
 ('blew', 0.019304454395430132),
 ('seeing', 0.019302442445035515),
 ('release', 0.019209904006239131),
 ('unfolds', 0.019184703456013676),
 ('boys', 0.019177414753158397),
 ('favorites', 0.01916037814148952),
 ('throughout', 0.01913689284569068),
 ('marvelous', 0.019110015321943574),
 ('relax', 0.019044075162625462),
 ('desire', 0.019016117204605987),
 ('end', 0.019014420138293207),
 ('questions', 0.018977699968684845),
 ('man', 0.018956744494720245),
 ('rea', 0.018928733395777456),
 ('comments', 0.018923870708363082),
 ('vengeance', 0.018908638777923942),
 ('brian', 0.01890687632302359),
 ('learned', 0.01889994792370445),
 ('lovely', 0.018854980464698634),
 ('seasons', 0.018852496578683826),
 ('shines', 0.018827509959493269),
 ('justice', 0.018827310862034655),
 ('succeeds', 0.018776998522312776),
 ('discovered', 0.018766802216817056),
 ('touch', 0.018762806738861479),
 ('white', 0.018743225697414184),
 ('bitter', 0.018724701999912892),
 ('knows', 0.01871906328874428),
 ('gene', 0.018660060796556237),
 ('mainstream', 0.018654252436913901),
 ('raw', 0.018609728881254829),
 ('focus', 0.018605078305494918),
 ('won', 0.018597537876871652),
 ('ve', 0.018560162581379321),
 ('million', 0.018514133006256917),
 ('attention', 0.018406547682637137),
 ('river', 0.018403383531225701),
 ('classics', 0.018375185367387355),
 ('quirky', 0.01835810053575461),
 ('although', 0.01835025297382192),
 ('september', 0.018345012211358886),
 ('emotional', 0.01832716507095174),
 ('events', 0.018324554475918117),
 ('released', 0.018304767183625541),
 ('thus', 0.018302709016086088),
 ('rules', 0.018298967789718682),
 ('trilogy', 0.0182619859222885),
 ('jackie', 0.018261017705562568),
 ('country', 0.018248984107628787),
 ('find', 0.018220001120247332),
 ('sure', 0.018205281970545901),
 ('overlooked', 0.018173644592107383),
 ('sensitive', 0.018173518786609156),
 ('harsh', 0.018143998075916403),
 ('chair', 0.018127987063468097),
 ('neatly', 0.018123044612179437),
 ('round', 0.018082305853658363),
 ('adult', 0.018060718859389525),
 ('strength', 0.018042558269708926),
 ('aunt', 0.018028313353173661),
 ('description', 0.01799755734083397),
 ('perspective', 0.01797476119333969),
 ('closer', 0.01794506642390805),
 ('extra', 0.017934760731343122),
 ('hit', 0.017910740181690345),
 ('tough', 0.01790450947037624),
 ('work', 0.017882494289916076),
 ('captivating', 0.017875072308920943),
 ('swim', 0.01785335427201485),
 ('holmes', 0.017846058193393122),
 ('unlikely', 0.017843839699452125),
 ('fears', 0.017838067451752794),
 ('nominated', 0.017837439304520593),
 ('neat', 0.01782306847491319),
 ('discovers', 0.017801301834152457),
 ('paris', 0.017798057884200084),
 ('streets', 0.0177461474805976),
 ('realism', 0.017729724930388047),
 ('travel', 0.017694257020940282),
 ('keep', 0.017684400089090113),
 ('anyway', 0.017675995400919457),
 ('realizes', 0.017618932935696153),
 ('variety', 0.017618487604827645),
 ('chief', 0.017603963834362819),
 ('broke', 0.017601657476194955),
 ('craven', 0.017597613499935327),
 ('moves', 0.017559744221771683),
 ('see', 0.017554713803040186),
 ('intellectual', 0.017537349329235116),
 ('normally', 0.017511237908563512),
 ('technique', 0.01750226507783019),
 ('dancer', 0.017501395365645257),
 ('awe', 0.017467446640641388),
 ('technology', 0.017414969148737192),
 ('kelly', 0.017380794671638271),
 ('particular', 0.017380503339109239),
 ('awards', 0.01734306737430508),
 ('twisted', 0.017342731655512204),
 ('manager', 0.017337683585341681),
 ('fantasy', 0.017314736380004709),
 ('blake', 0.017282963990552191),
 ('criticism', 0.017279558676803683),
 ('identify', 0.017277471199843665),
 ('collection', 0.017253533052260933),
 ('sidney', 0.017239120845031552),
 ('ironic', 0.017225809884120889),
 ('score', 0.017223046869263507),
 ('charm', 0.017204164112517881),
 ('lonely', 0.017192972607511965),
 ('recall', 0.017189512282670277),
 ('dream', 0.017185607849471315),
 ('known', 0.017169341473045802),
 ('hoffman', 0.017123937023014242),
 ('answers', 0.017112374531695257),
 ('taking', 0.017102244694823326),
 ('color', 0.017086755659474453),
 ('existed', 0.017084491834780034),
 ('mel', 0.017080644125498479),
 ('treats', 0.017076365809061671),
 ('kennedy', 0.017063054110179412),
 ('millionaire', 0.017058120181534072),
 ('stewart', 0.01701786393539512),
 ('soon', 0.017016949690113498),
 ('style', 0.016978446616527421),
 ('urban', 0.016961773741888564),
 ('sides', 0.016958377563876279),
 ('nicely', 0.01695658404466506),
 ('survive', 0.016953201066203551),
 ('contrast', 0.016949017788907696),
 ('granted', 0.016948500759420799),
 ('wes', 0.016856895803564042),
 ('heroic', 0.016849533387674569),
 ('sadness', 0.016836182986070536),
 ('faults', 0.016833966998505413),
 ('ladies', 0.016818146836646248),
 ('walter', 0.0168136452096148),
 ('exceptional', 0.01681024298533729),
 ('dangerous', 0.016796058008032445),
 ('fan', 0.016737120507724371),
 ('witch', 0.016717085914917346),
 ('occasionally', 0.016711349636820475),
 ('movies', 0.01667668795406364),
 ('celebration', 0.016664197566723736),
 ('castle', 0.016661909651854552),
 ('catch', 0.016647995152024704),
 ('its', 0.016639302941262302),
 ('tribute', 0.016629617927918804),
 ('jimmy', 0.016625132101972973),
 ('bravo', 0.016616754156460037),
 ('enjoying', 0.016613140144305663),
 ('bus', 0.016593157501778106),
 ('documentary', 0.016564651461285374),
 ('frightening', 0.016559987706802764),
 ('guilty', 0.016536110253664232),
 ('slightly', 0.016526421724199342),
 ('is', 0.016511509443399769),
 ('chan', 0.016507204515006656),
 ('mixed', 0.016506847567311404),
 ('curious', 0.016506488394564568),
 ('spirit', 0.016502977044099098),
 ('pleased', 0.016487261129390262),
 ('most', 0.016476759333214089),
 ('chemistry', 0.016425356343989065),
 ('age', 0.016410666314929899),
 ('understanding', 0.01634569620294557),
 ('marie', 0.016341053241072698),
 ('dreams', 0.016332672013556312),
 ('again', 0.016287090973937754),
 ('union', 0.016282379359022558),
 ('spy', 0.016278154923785922),
 ('presented', 0.016273043238663475),
 ('steele', 0.0162609933390068),
 ('lay', 0.01625999545879786),
 ('plenty', 0.016247194189832843),
 ('horrors', 0.016246022980305592),
 ('black', 0.016223176851856813),
 ('comedy', 0.0162204080220106),
 ('winner', 0.016220318857398407),
 ('african', 0.01621445660979496),
 ('drummer', 0.016178152199513931),
 ('entertainment', 0.016173112007890969),
 ('delivers', 0.016166599465683076),
 ('stays', 0.016139476352793777),
 ('america', 0.016108896341111505),
 ('disappoint', 0.016066615933996442),
 ('gorgeous', 0.016062350166815047),
 ('sisters', 0.016060080355840684),
 ('subsequent', 0.016043574203873964),
 ('cerebral', 0.016039058904070029),
 ('french', 0.016038425317363183),
 ('perfection', 0.016033154869346922),
 ('likable', 0.016021713396124574),
 ('warm', 0.016019144095827349),
 ('studio', 0.01600723281846457),
 ('late', 0.015997923350457056),
 ('reality', 0.015978872249423726),
 ('showed', 0.015938750644323936),
 ('figures', 0.01592744660892324),
 ('ever', 0.015926454600790653),
 ('italy', 0.015909186780479364),
 ('accustomed', 0.015906246911558283),
 ('into', 0.015892173681617969),
 ('he', 0.015866239932092345),
 ('journey', 0.015817191390925515),
 ('waters', 0.015800906878826314),
 ('bill', 0.01578597614879133),
 ('cousin', 0.015784382710801671),
 ('explores', 0.015768756345569585),
 ('originally', 0.015766016465315408),
 ('astonishing', 0.015741175347778351),
 ('mouse', 0.015739473070555069),
 ('affect', 0.015719798460443277),
 ('authenticity', 0.015716491136675288),
 ('key', 0.015706372736941254),
 ('authorities', 0.015700111946298504),
 ('fortunately', 0.015676427069879845),
 ('notes', 0.015668388567765472),
 ('disagree', 0.01565982223146424),
 ('advanced', 0.015653464856497622),
 ('contribution', 0.015651919381489542),
 ('flaw', 0.01563062317548556),
 ('burning', 0.015593951152590374),
 ('scoop', 0.0155809110142135),
 ('levels', 0.015579506047588178),
 ('dead', 0.015575945832152265),
 ('reveals', 0.015552631094426428),
 ('explicit', 0.01553505254238324),
 ('fault', 0.015532818014787654),
 ('requires', 0.015440001642516233),
 ('way', 0.015434313286947589),
 ('waitress', 0.015433929845739229),
 ('vividly', 0.015399209375312223),
 ('truman', 0.015388667015530336),
 ('leslie', 0.015388355420398656),
 ('cool', 0.015362419182461),
 ('i', 0.015358846209804477),
 ('dated', 0.015351894934707866),
 ('ruthless', 0.015347223840634989),
 ('anymore', 0.015327840988573722),
 ('batman', 0.015325445892906487),
 ('york', 0.015323650797282724),
 ('expressions', 0.015290943599335203),
 ('terms', 0.015285161966075789),
 ('sunday', 0.015279982329904815),
 ('chinese', 0.015240680418926652),
 ('done', 0.015230733309302686),
 ('behind', 0.015219079842199827),
 ('event', 0.015214794169662833),
 ('chamberlain', 0.015214082741427186),
 ('mysteries', 0.015204556759409918),
 ('manages', 0.01520348693463202),
 ('simpsons', 0.01519184981292622),
 ('mine', 0.015191085212402708),
 ('canadian', 0.015117611742208801),
 ('purple', 0.015100505661562466),
 ('website', 0.015095063701722861),
 ('master', 0.015091528696557648),
 ('charming', 0.015088362486196537),
 ('joe', 0.015081920177878148),
 ('reservations', 0.015077821343474089),
 ('fever', 0.015076873583983725),
 ('covers', 0.0150472334532588),
 ('madness', 0.015030361859657218),
 ('glimpse', 0.014991086926970954),
 ('pilot', 0.014978443271049673),
 ('johansson', 0.014975808461544404),
 ('explains', 0.014970512080227472),
 ('excellently', 0.014970388571598842),
 ('hawke', 0.014969750109931361),
 ('genuinely', 0.014947672770702573),
 ('often', 0.014942833143544474),
 ('cube', 0.014939928709365363),
 ('clean', 0.014937853229023509),
 ('ensemble', 0.014913656909087879),
 ('referred', 0.014910582069880154),
 ('replies', 0.014907131594945562),
 ('disease', 0.014895193110452176),
 ('wish', 0.014892245549307041),
 ('logical', 0.014888665766304059),
 ('nathan', 0.014869928851670398),
 ('aware', 0.014869867112894513),
 ('exciting', 0.0148231396949806),
 ('gone', 0.014821497224651529),
 ('critics', 0.014818559383907354),
 ('split', 0.014788117032985616),
 ('series', 0.014770708703162185),
 ('henry', 0.014757735101897441),
 ('prisoners', 0.014747710184003867),
 ('sentenced', 0.014746219906503842),
 ('laughing', 0.014722151818909788),
 ('president', 0.014671766779490529),
 ('list', 0.014666775185665162),
 ('ones', 0.014658997854109325),
 ('information', 0.014651687169784237),
 ('bonus', 0.014648059891508169),
 ('chicago', 0.014631769872667609),
 ('someday', 0.01462934047526257),
 ('splendid', 0.014609703424340656),
 ('surprises', 0.014608824054662472),
 ('sentimental', 0.014591361045287948),
 ('admit', 0.014588098910742794),
 ('previously', 0.014571223247118624),
 ('conveys', 0.014567143509152133),
 ('prominent', 0.014547363114083271),
 ('born', 0.014536990751946696),
 ('necessary', 0.014533225697989446),
 ('yes', 0.014531704633026972),
 ('marvel', 0.014527554209112411),
 ('initially', 0.014510187714555971),
 ('jake', 0.014502509408478864),
 ('matters', 0.014497730426084213),
 ('lucas', 0.014496736417950701),
 ('stories', 0.014475382661229963),
 ('happy', 0.014471040644253804),
 ('improvement', 0.014459225025278404),
 ('anger', 0.014440696969299303),
 ('hong', 0.014412020732763238),
 ('devotion', 0.014406165594180762),
 ('infamous', 0.014402483161136853),
 ('sir', 0.01439058584994257),
 ('fashioned', 0.014376495163092876),
 ('whenever', 0.014311984840844736),
 ('facing', 0.014311813694297491),
 ('spin', 0.014300937890947244),
 ('clear', 0.014297831903635034),
 ('verhoeven', 0.014290838087095132),
 ('onto', 0.014287704198288407),
 ('sheriff', 0.014266680346279271),
 ('boy', 0.014238393212172502),
 ('felix', 0.014236371593101718),
 ('what', 0.014231196728127875),
 ('site', 0.014212839329217037),
 ('hits', 0.014208508715996916),
 ('convincingly', 0.014165838532387462),
 ('adventures', 0.014158492204346299),
 ('multiple', 0.014150723728410534),
 ('wrapped', 0.01411875910345912),
 ('reveal', 0.01407651065382279),
 ('toby', 0.01407522149311176),
 ('months', 0.014061986005374697),
 ('comedies', 0.014050301808876077),
 ('shot', 0.014031987455271904),
 ('holds', 0.014023504904484219),
 ('weeks', 0.014002257803042333),
 ('window', 0.01398543454161485),
 ('received', 0.013983301709629933),
 ('him', 0.013968181093938298),
 ('court', 0.013964352058193517),
 ('double', 0.013960483190947278),
 ('refuses', 0.013957613385590657),
 ('stand', 0.013948813859221362),
 ('shocked', 0.013935157243261925),
 ('powell', 0.01393406244197702),
 ('brutal', 0.013924129605946696),
 ('among', 0.013913156765292941),
 ('prostitute', 0.013911765274631798),
 ('nine', 0.013882343344720908),
 ('timeless', 0.01385827439549941),
 ('likes', 0.013844971514262238),
 ('kurosawa', 0.013820064338774899),
 ('fact', 0.013814297186034396),
 ('ass', 0.01381389978194981),
 ('deanna', 0.013799520782801155),
 ('almost', 0.013791517357271348),
 ('technicolor', 0.013790541990858999),
 ('adventure', 0.013782999907047074),
 ('gerard', 0.013776140434137598),
 ('analysis', 0.013764039325045373),
 ('mid', 0.013747853289146217),
 ('stanwyck', 0.013738927891779256),
 ('mann', 0.013726915645691879),
 ('stuart', 0.013700229069235785),
 ('reluctantly', 0.013697113976504027),
 ('humanity', 0.013690830736911042),
 ('classical', 0.013688949911986577),
 ('health', 0.013684784640613448),
 ('edie', 0.013683859176013937),
 ('british', 0.013666460250876464),
 ('primary', 0.013661794714033903),
 ('coaster', 0.013660631014138402),
 ('explore', 0.013656042478726911),
 ('china', 0.013638756081011158),
 ('advantage', 0.013631698822745392),
 ('protagonists', 0.013627593648932788),
 ('partly', 0.013617059618125366),
 ('artist', 0.013597123465502837),
 ('terrifying', 0.013581203319898146),
 ('scarlett', 0.013567078625941564),
 ('mesmerizing', 0.01354781689947941),
 ('prince', 0.013541105943095589),
 ('weird', 0.013535346249579566),
 ('vance', 0.013518150392608123),
 ('collect', 0.013513303578887656),
 ('humour', 0.01350889016667799),
 ('doc', 0.013507286431402926),
 ('history', 0.01350612020078829),
 ('miss', 0.013498187990897418),
 ('angles', 0.013497507265665436),
 ('dealers', 0.013493607234383897),
 ('mass', 0.013472328625932875),
 ('paramount', 0.013467546662344529),
 ('musicians', 0.013464517138686275),
 ('jackman', 0.013441428735872101),
 ('cheer', 0.013440230376864147),
 ('aired', 0.013427957547366863),
 ('personal', 0.01342241888767008),
 ('become', 0.013415910991211791),
 ('wang', 0.013406655764270569),
 ('unforgettable', 0.013405651085753994),
 ('theme', 0.013397995857105537),
 ('satisfy', 0.013361012634637442),
 ('beginning', 0.013353575498360091),
 ('tongue', 0.013332587937334748),
 ('ran', 0.013322580056022434),
 ('vh', 0.01332169486224734),
 ('april', 0.013317958082689027),
 ('cracking', 0.013316482654851882),
 ('hilariously', 0.013312111975215816),
 ('addictive', 0.013304056341282523),
 ('factory', 0.013302408850101533),
 ('bloom', 0.013287106893282026),
 ('outcome', 0.013278893812795739),
 ('startling', 0.013276469703553513),
 ('portrait', 0.013273055100999263),
 ('adapted', 0.013258514308676845),
 ('raines', 0.013257908724754864),
 ('sky', 0.013252502620889896),
 ('earlier', 0.01323311074363257),
 ('atlantis', 0.013228188610144563),
 ('delirious', 0.013226874818125445),
 ('titanic', 0.013205633401144468),
 ('nevertheless', 0.013198200611184941),
 ('proved', 0.013189760358384478),
 ('denzel', 0.013188430841614765),
 ('pleasant', 0.013180077348723354),
 ('horses', 0.013178651568029467),
 ('about', 0.013166154528006851),
 ('astounding', 0.01316169833722681),
 ('savage', 0.013154100553759924),
 ('winning', 0.013153246708379671),
 ('rose', 0.013145586701309787),
 ('fitting', 0.013133578254330345),
 ('compared', 0.013131693803520044),
 ('took', 0.013119343481498973),
 ('masterson', 0.013112762074217889),
 ('owner', 0.013108690454819138),
 ('delight', 0.013107278788311012),
 ('conventions', 0.013106039770696055),
 ('natali', 0.013094964441143218),
 ('message', 0.013093664295113428),
 ('stood', 0.013090122718303435),
 ('sailor', 0.01305895917042345),
 ('ida', 0.013058842950256239),
 ('escaping', 0.013052723624706782),
 ('top', 0.01304746674102443),
 ('louis', 0.013046238442637016),
 ('peace', 0.013040907918892312),
 ('several', 0.013028244887060284),
 ('info', 0.01302375462555018),
 ('graphics', 0.013020850288881863),
 ('reflection', 0.013019243823940109),
 ('slimy', 0.013014377070231843),
 ('elvira', 0.013009811638957066),
 ('andre', 0.01300004731344674),
 ('kong', 0.012999080313300517),
 ('mayor', 0.012994758409723568),
 ('punishment', 0.01298826494961494),
 ('morris', 0.012983710119604967),
 ('hall', 0.012981593609354809),
 ('match', 0.012980233583057334),
 ('bleak', 0.01297250508630407),
 ('lindy', 0.012972248933121261),
 ('sequence', 0.012964435808713572),
 ('learn', 0.012938848970083352),
 ('happen', 0.01293283638787374),
 ('john', 0.012929524979001681),
 ('gothic', 0.012926957011734873),
 ('wider', 0.012920985981480957),
 ('popular', 0.012891690509844067),
 ('diverse', 0.012875263936567807),
 ('compare', 0.012869395292065204),
 ('brooklyn', 0.01285298624326393),
 ('broadcast', 0.012839574692097616),
 ('zane', 0.01283430295770914),
 ('andrew', 0.012824020940615254),
 ('finely', 0.012822716004015858),
 ('confronted', 0.012817523686608632),
 ('going', 0.012809762839304982),
 ('likewise', 0.012804639349082518),
 ('breath', 0.0127901326594179),
 ('building', 0.012789809704793877),
 ('suggesting', 0.012780624321169351),
 ('contemporary', 0.012772749462937506),
 ('midnight', 0.012766963563112082),
 ('victoria', 0.012756422131580528),
 ('lasting', 0.012752424415642586),
 ('kitty', 0.012751468371946007),
 ('continued', 0.01274432545648539),
 ('indian', 0.012712962842718686),
 ('subplots', 0.012709887814283902),
 ('douglas', 0.012693830679455889),
 ('explosions', 0.012692697593201848),
 ('bond', 0.012689802823687823),
 ('delightfully', 0.012669417460922622),
 ('understated', 0.012669374312789342),
 ('greater', 0.012664580396020165),
 ('sailing', 0.012662424581282425),
 ('images', 0.01266180304885987),
 ('copy', 0.012624649645734161),
 ('seat', 0.012610464273152509),
 ('eleven', 0.012602533659978888),
 ('riveting', 0.012591829460094517),
 ('boiled', 0.012588863529638759),
 ('academy', 0.012581996178142974),
 ('whilst', 0.012569841653295642),
 ('heaven', 0.012547361621330914),
 ('fruit', 0.012543513029693249),
 ('reviewer', 0.012534273375083886),
 ('cost', 0.012529643005796615),
 ('week', 0.012522845015008281),
 ('intriguing', 0.012508687653306356),
 ('streak', 0.012507752385208562),
 ('san', 0.012502130058217934),
 ('awareness', 0.012476446442012451),
 ('catching', 0.012467108595451522),
 ('kicks', 0.012457714930570581),
 ('complexities', 0.012454362663082464),
 ('draws', 0.012447753285125917),
 ('easily', 0.012444885855614875),
 ('ealing', 0.012444339255708925),
 ('psychopath', 0.012431259926282268),
 ('skin', 0.012424248540973577),
 ('creative', 0.012386713452491529),
 ('recognition', 0.012354025801439416),
 ('downey', 0.012348698765161131),
 ('symbolism', 0.012329925038271331),
 ('touches', 0.01232801347075147),
 ('everyday', 0.012324934809895893),
 ('achieves', 0.012314898707483495),
 ('outcast', 0.012313662230219678),
 ('overwhelmed', 0.012306633138869474),
 ...]
In [82]:
get_most_similar_words("terrible")
Out[82]:
[('worst', 0.16966107259049848),
 ('awful', 0.12026847019691246),
 ('waste', 0.11945367265311006),
 ('poor', 0.092758887574435497),
 ('terrible', 0.09142538719772797),
 ('dull', 0.084209271678223618),
 ('poorly', 0.081241544516042027),
 ('disappointment', 0.08006475962136872),
 ('fails', 0.07859977372333754),
 ('disappointing', 0.077339485480323364),
 ('boring', 0.077127858748012895),
 ('unfortunately', 0.075502449705859079),
 ('worse', 0.070601835364194676),
 ('mess', 0.070564299623590426),
 ('stupid', 0.06948482283254305),
 ('badly', 0.066888903666228586),
 ('annoying', 0.065687021903374165),
 ('bad', 0.063093814537572138),
 ('save', 0.062880597495865734),
 ('disappointed', 0.062692353812072873),
 ('wasted', 0.061387183028051295),
 ('supposed', 0.060985452957725145),
 ('horrible', 0.060121772339380132),
 ('laughable', 0.058698406285467637),
 ('crap', 0.05810452866788459),
 ('basically', 0.057218840369636155),
 ('nothing', 0.057158220043034218),
 ('ridiculous', 0.056905481068931438),
 ('lacks', 0.055766565889465443),
 ('lame', 0.05561600905811017),
 ('avoid', 0.05551872607319721),
 ('unless', 0.054208926212940739),
 ('script', 0.053948359467048505),
 ('failed', 0.05341393055000912),
 ('pointless', 0.052855531546894118),
 ('oh', 0.05276158093317683),
 ('effort', 0.050773747127292324),
 ('guess', 0.050379576420076531),
 ('minutes', 0.049784532804242165),
 ('wooden', 0.049453108380727188),
 ('redeeming', 0.049182869114721743),
 ('seems', 0.049079625154669751),
 ('instead', 0.047957645123532268),
 ('weak', 0.046496387374765663),
 ('pathetic', 0.046099741149715767),
 ('looks', 0.045796536730244863),
 ('hoping', 0.045082242887577027),
 ('wonder', 0.044669791780934609),
 ('forgettable', 0.042854349251871718),
 ('silly', 0.042237829687270002),
 ('attempt', 0.041706299941373516),
 ('predictable', 0.041514442438568125),
 ('someone', 0.041506119027337307),
 ('sorry', 0.040868877281533364),
 ('might', 0.040445683500688355),
 ('slow', 0.040346869107034951),
 ('painful', 0.040220039039613263),
 ('thin', 0.040062642253777862),
 ('mediocre', 0.039407165377577394),
 ('garbage', 0.039310979440981109),
 ('money', 0.038907973313640508),
 ('none', 0.038300807052230962),
 ('bland', 0.03806224605708506),
 ('couldn', 0.03801666421895792),
 ('either', 0.037738833070341961),
 ('unfunny', 0.037076629805044503),
 ('entire', 0.036642119399463179),
 ('cheap', 0.036516800802525569),
 ('honestly', 0.03621204154379782),
 ('mildly', 0.035744850608185628),
 ('total', 0.035560454471013067),
 ('neither', 0.035415946043548557),
 ('making', 0.035244315060985597),
 ('problem', 0.035088251034562444),
 ('flat', 0.034518947038747076),
 ('bizarre', 0.034509460694521148),
 ('group', 0.034335883528586783),
 ('dreadful', 0.034287618511331872),
 ('ludicrous', 0.034159649323816044),
 ('decent', 0.033771585787868957),
 ('clich', 0.033751444631720563),
 ('daughter', 0.033732725858384882),
 ('bored', 0.033622879572852558),
 ('horror', 0.033464120619956829),
 ('writing', 0.033437913916756788),
 ('skip', 0.033430639850491169),
 ('absurd', 0.033154173530163325),
 ('barely', 0.032653416827517719),
 ('idea', 0.032584013175663229),
 ('wasn', 0.03248120796627206),
 ('fake', 0.032136435098031539),
 ('believe', 0.031677858935800801),
 ('uninteresting', 0.031526815915867146),
 ('reason', 0.031390715260270548),
 ('scenes', 0.031216362935389166),
 ('alright', 0.031046883113956265),
 ('body', 0.030999982945986656),
 ('no', 0.030917695380560422),
 ('insult', 0.030808450146355942),
 ('mst', 0.03052791647139786),
 ('nowhere', 0.030352177599338306),
 ('lousy', 0.030160195468380811),
 ('didn', 0.030115903194061412),
 ('interest', 0.029888118468771138),
 ('half', 0.02981324611505725),
 ('lee', 0.029804235955718662),
 ('dimensional', 0.029562861996904034),
 ('unconvincing', 0.029322607679950256),
 ('left', 0.029322408787030532),
 ('sex', 0.029296748476082161),
 ('even', 0.029225209450923415),
 ('far', 0.029192618334294554),
 ('tries', 0.029004001132703548),
 ('anything', 0.028988097743501123),
 ('trying', 0.02891947722846511),
 ('accent', 0.028779542310252575),
 ('nudity', 0.028662654953266045),
 ('apparently', 0.028291626941517923),
 ('zombies', 0.028178583120430676),
 ('sense', 0.028166740534758778),
 ('incoherent', 0.027988926190862518),
 ('something', 0.027986519420278216),
 ('tedious', 0.027952212405329527),
 ('wrong', 0.027831947557365632),
 ('were', 0.027825695799985409),
 ('endless', 0.027824591794431471),
 ('turkey', 0.027624266205058503),
 ('zombie', 0.027543333835110845),
 ('appears', 0.02746984087848324),
 ('embarrassing', 0.027425437142424347),
 ('walked', 0.027411768647042711),
 ('premise', 0.027346072285964196),
 ('ok', 0.027333008356232015),
 ('result', 0.027312558653191901),
 ('complete', 0.027247564384243458),
 ('t', 0.027186737465610237),
 ('least', 0.026949072632017304),
 ('was', 0.026917906772065289),
 ('unwatchable', 0.026829458762459388),
 ('sat', 0.026806511532143466),
 ('to', 0.026801902698524095),
 ('sadly', 0.026753380035391513),
 ('christmas', 0.026735555962199231),
 ('gore', 0.026670161630608386),
 ('mother', 0.026612696987437772),
 ('aspects', 0.026583237615263804),
 ('amateurish', 0.026565159291175696),
 ('below', 0.026548271016778154),
 ('stupidity', 0.026460990221946923),
 ('appeal', 0.02639659671342098),
 ('trite', 0.026331168557051407),
 ('then', 0.026284629203937666),
 ('rubbish', 0.026216695246125493),
 ('okay', 0.025981446095883619),
 ('sucks', 0.025930224401969338),
 ('pretentious', 0.025907912370628301),
 ('positive', 0.025773976409798768),
 ('confusing', 0.025737618729473628),
 ('remotely', 0.025699566061653027),
 ('obnoxious', 0.025454829745850262),
 ('m', 0.025435495928249209),
 ('rent', 0.025373441934038485),
 ('laughs', 0.025346512576104416),
 ('re', 0.025342239903627856),
 ('context', 0.02527438259371358),
 ('disgusting', 0.025195418263468182),
 ('so', 0.025148024611438818),
 ('tiresome', 0.025031684199042101),
 ('miscast', 0.024970026716882365),
 ('aren', 0.024968703889385893),
 ('forced', 0.024933299777713722),
 ('paid', 0.024906929703330333),
 ('utter', 0.024802282233385521),
 ('uninspired', 0.024799576212017459),
 ('falls', 0.024749631706810708),
 ('throw', 0.024614954073046709),
 ('been', 0.024470487429445055),
 ('ugly', 0.024334820044832371),
 ('hopes', 0.024315635652054305),
 ('dire', 0.024191221840051087),
 ('hunter', 0.024171291127418476),
 ('producers', 0.024089231997130232),
 ('seem', 0.024065146985976848),
 ('straight', 0.023996666451552164),
 ('vampire', 0.023942797574072684),
 ('paper', 0.023908828083961022),
 ('crappy', 0.023807255546688076),
 ('excited', 0.023764516357875836),
 ('start', 0.023739057832096767),
 ('material', 0.023729757962158735),
 ('excuse', 0.023681577270328113),
 ('cop', 0.02348067702892814),
 ('f', 0.023312251619610848),
 ('ms', 0.023282327986278321),
 ('villain', 0.023158273483660743),
 ('fest', 0.023091425711778243),
 ('lack', 0.023039437894325179),
 ('such', 0.023031161078650959),
 ('saving', 0.023025745893238071),
 ('clichs', 0.022928209200342317),
 ('enough', 0.022921397253925311),
 ('mistake', 0.022868689470374997),
 ('unbelievable', 0.022864325693347898),
 ('maybe', 0.022825002748295301),
 ('blame', 0.022808369279543168),
 ('bunch', 0.022769532876362852),
 ('version', 0.022753296945755484),
 ('candy', 0.022749363632616756),
 ('island', 0.02274580066608016),
 ('tripe', 0.022695188509832681),
 ('wasting', 0.022681371343356769),
 ('inept', 0.022679276425665775),
 ('actor', 0.022636975371771034),
 ('flop', 0.022613758633444527),
 ('any', 0.0225606084376072),
 ('k', 0.022554017579615043),
 ('appalling', 0.022500975853556055),
 ('propaganda', 0.022465024430755747),
 ('major', 0.022430482324246579),
 ('sequel', 0.022362296462477876),
 ('offensive', 0.022326080604825448),
 ('revenge', 0.02231515094247262),
 ('shoot', 0.022288105709211744),
 ('whatsoever', 0.022286498346940936),
 ('ruined', 0.022173811528211053),
 ('painfully', 0.022152008209040921),
 ('on', 0.022016020939730048),
 ('shame', 0.021981493467648276),
 ('effects', 0.021849482201960271),
 ('wouldn', 0.021848506706035161),
 ('development', 0.02177324199006574),
 ('plot', 0.021733893676650601),
 ('co', 0.021728673026887656),
 ('church', 0.021719723717009976),
 ('storyline', 0.021663404462350769),
 ('screenwriter', 0.021660177252485924),
 ('bother', 0.021571699909566977),
 ('miserably', 0.021516173872499812),
 ('christian', 0.021515873507543665),
 ('add', 0.021468134313277942),
 ('found', 0.021449077767987153),
 ('watching', 0.021344833140596587),
 ('pseudo', 0.021308384076023465),
 ('boredom', 0.021119995917930009),
 ('please', 0.021090765093296295),
 ('talent', 0.02100584744527479),
 ('continuity', 0.021005145852421917),
 ('talents', 0.020992716564348892),
 ('college', 0.020990718952374858),
 ('tried', 0.020978219626186824),
 ('editing', 0.020865814801443762),
 ('lines', 0.020853755408845782),
 ('drivel', 0.020726493692759695),
 ('generous', 0.020697017742242002),
 ('potential', 0.020672988272090829),
 ('creatures', 0.020601399429061321),
 ('disjointed', 0.020581338926655209),
 ('irritating', 0.020576764848872688),
 ('pile', 0.020560898967541544),
 ('acts', 0.020560043588043531),
 ('junk', 0.020558505639508208),
 ('raped', 0.020550629285133262),
 ('christ', 0.020481424289613533),
 ('brain', 0.020431161137662714),
 ('slasher', 0.020425652445140899),
 ('seconds', 0.020390927443421882),
 ('nobody', 0.020389268101762628),
 ('dialog', 0.020338349197601493),
 ('makers', 0.020333184431951135),
 ('excitement', 0.02029045602429182),
 ('flashbacks', 0.020267510512910248),
 ('sloppy', 0.020234078734398368),
 ('joke', 0.020212187048528524),
 ('sleep', 0.020108895811675787),
 ('bottom', 0.019986770547280187),
 ('however', 0.019981104962051181),
 ('fail', 0.019937405211620234),
 ('sucked', 0.019874923017311575),
 ('soap', 0.019853525395543015),
 ('looked', 0.01981021184092711),
 ('stinks', 0.019769365381781166),
 ('deserve', 0.019614034321096468),
 ('exact', 0.019555320028258997),
 ('substance', 0.019552647432498186),
 ('yeah', 0.019513150136671549),
 ('production', 0.019510696746296526),
 ('female', 0.019476914978121807),
 ('unintentional', 0.019387723280198933),
 ('army', 0.019364852889641616),
 ('minute', 0.019351862554568246),
 ('unrealistic', 0.019350657250497869),
 ('rescue', 0.019340920364464904),
 ('theater', 0.01933382927666848),
 ('monsters', 0.01933263601575104),
 ('frankly', 0.019326550823843887),
 ('children', 0.019314240606868871),
 ('convince', 0.019312073515560645),
 ('shallow', 0.01929844550493055),
 ('synopsis', 0.019259706392396589),
 ('scott', 0.01918347440557033),
 ('seriously', 0.019182027987150015),
 ('ridiculously', 0.019169300285178985),
 ('looking', 0.019150985439966572),
 ('kareena', 0.019110212601710665),
 ('wrote', 0.019015323411486425),
 ('attempts', 0.019006343780653929),
 ('bothered', 0.018970712777578523),
 ('utterly', 0.018924824767803397),
 ('giant', 0.018891084650049701),
 ('writers', 0.018868906582101285),
 ('atrocious', 0.018848042351202358),
 ('plain', 0.018828766525513588),
 ('presumably', 0.018826629750947947),
 ('example', 0.018796453237837171),
 ('murray', 0.018754173430046935),
 ('seemed', 0.018749132295913074),
 ('stay', 0.01874415970643269),
 ('interview', 0.018672085964709539),
 ('disaster', 0.018553283301235162),
 ('value', 0.01854408095516636),
 ('paint', 0.01852960713242937),
 ('original', 0.018528190682362413),
 ('difficult', 0.018518455298178593),
 ('care', 0.018494804801171254),
 ('watchable', 0.018481870605389094),
 ('useless', 0.018470481000366863),
 ('desperately', 0.018421675047000253),
 ('except', 0.01839199355123854),
 ('doing', 0.01838473762135065),
 ('errors', 0.018380414978330265),
 ('solely', 0.018349321075079396),
 ('sitting', 0.018346519170301074),
 ('giving', 0.018335957397904838),
 ('ideas', 0.018327099221245202),
 ('unbearable', 0.018321159676201407),
 ('advice', 0.01827337252768884),
 ('nor', 0.018254420259554288),
 ('project', 0.018252633214771753),
 ('dozen', 0.018206363291515752),
 ('charles', 0.018163660578293449),
 ('plastic', 0.018161741020378656),
 ('book', 0.018139011699011283),
 ('shots', 0.01811487606436386),
 ('ill', 0.018103621818215749),
 ('grade', 0.018088309511242365),
 ('where', 0.01806588259969516),
 ('women', 0.018026883825059358),
 ('screenplay', 0.018014307024101332),
 ('through', 0.017990863003241403),
 ('actress', 0.017876003487857159),
 ('sign', 0.01786563614405693),
 ('walk', 0.017823522607756635),
 ('santa', 0.017727102733219185),
 ('happens', 0.017722408798843597),
 ('contrived', 0.017720303645882791),
 ('gun', 0.01768599317693385),
 ('ashamed', 0.017679623098721588),
 ('gratuitous', 0.017665737783803856),
 ('one', 0.017608259344043274),
 ('not', 0.017562336441189895),
 ('credibility', 0.017558852870687959),
 ('promising', 0.017544417082572289),
 ('risk', 0.017532600100721243),
 ('sub', 0.017531947750389475),
 ('lacking', 0.017513759836446534),
 ('fell', 0.017464857159331278),
 ('scenery', 0.017451365955319955),
 ('flesh', 0.017402514298262693),
 ('animal', 0.017386681692205429),
 ('tired', 0.017383214541566692),
 ('writer', 0.017380887757560838),
 ('lady', 0.017370657212565484),
 ('dialogue', 0.01731937394664762),
 ('terribly', 0.017291135257276886),
 ('downright', 0.01727767556320545),
 ('rented', 0.017247977656900719),
 ('clumsy', 0.017241290805182087),
 ('blah', 0.017217377177396766),
 ('random', 0.017199913549248002),
 ('members', 0.017198947117344762),
 ('three', 0.017189383912215916),
 ('celluloid', 0.017174000803758884),
 ('your', 0.017140173886430042),
 ('lost', 0.017127763322061808),
 ('suddenly', 0.017124566068806118),
 ('cover', 0.017066680835874291),
 ('existent', 0.017028540662919339),
 ('mostly', 0.017009366180205376),
 ('dig', 0.016990887715494295),
 ('spending', 0.016944400877991022),
 ('elsewhere', 0.016937877167916528),
 ('suck', 0.016897737192407586),
 ('apparent', 0.016783874225807266),
 ('fill', 0.016766110935370601),
 ('running', 0.016728621099996378),
 ('jokes', 0.016718920312228033),
 ('cheese', 0.016699473014889846),
 ('outer', 0.016612591391981471),
 ('anil', 0.01658120084065488),
 ('director', 0.016512894450311441),
 ('awfully', 0.016492200414985309),
 ('mix', 0.016468214294032515),
 ('naturally', 0.016404879835269448),
 ('scientist', 0.016395078905109241),
 ('imdb', 0.01634316803410717),
 ('dumb', 0.016289693549692456),
 ('made', 0.016279809910441426),
 ('curiosity', 0.016277433551029969),
 ('somewhere', 0.01623611744674798),
 ('stereotyped', 0.016235814767295298),
 ('officer', 0.016235401039884575),
 ('shelf', 0.016151304702362458),
 ('spends', 0.016089566181633222),
 ('explanation', 0.016040330428242221),
 ('proof', 0.016021381235154286),
 ('killed', 0.016004979798664873),
 ('songs', 0.016002280189188107),
 ('why', 0.015994497048455198),
 ('adequate', 0.015978003410591614),
 ('assume', 0.015953574865902442),
 ('mean', 0.015907137878947288),
 ('year', 0.015900265748875868),
 ('named', 0.015897377296493424),
 ('actors', 0.015880849255718716),
 ('dreck', 0.015844184837849277),
 ('ripped', 0.015809352391222237),
 ('exception', 0.015801037653546939),
 ('let', 0.015747554995806858),
 ('said', 0.015739206756809145),
 ('handed', 0.015729421480492778),
 ('five', 0.015692627471399451),
 ('manage', 0.015647108880417111),
 ('thousands', 0.01564343097589297),
 ('faith', 0.015616976955551873),
 ('hideous', 0.015589158171890802),
 ('alas', 0.015538213296394246),
 ('interesting', 0.015537431607034398),
 ('camera', 0.015534217771859269),
 ('affair', 0.015499371820329408),
 ('basketball', 0.015498025904813832),
 ('saved', 0.015479619606949029),
 ('allow', 0.015471290657970005),
 ('embarrassed', 0.015465690911012363),
 ('historically', 0.015405093934372959),
 ('guy', 0.015377641254470054),
 ('smoking', 0.015346508854378342),
 ('implausible', 0.015340453986022747),
 ('entirely', 0.015334692788183644),
 ('insulting', 0.015328508644691506),
 ('unable', 0.015321433538157151),
 ('supposedly', 0.015316107621242407),
 ('replaced', 0.015263381265213496),
 ('write', 0.015247349730647843),
 ('devoid', 0.015196181920380176),
 ('angry', 0.015128878425101425),
 ('cannot', 0.015124671278970781),
 ('stinker', 0.015117424017513684),
 ('types', 0.015097306608067001),
 ('hype', 0.015076288365524309),
 ('responsible', 0.014991356276561571),
 ('peter', 0.014969127137333013),
 ('putting', 0.014910707254937238),
 ('over', 0.014897181020826432),
 ('cardboard', 0.014888714204149053),
 ('interspersed', 0.014883165331874141),
 ('haired', 0.014880449676198559),
 ('spend', 0.014876094316227655),
 ('elvis', 0.014854709844151744),
 ('indulgent', 0.014847232132387195),
 ('catholic', 0.014843519648135949),
 ('downhill', 0.014807184967767799),
 ('lazy', 0.01478151469522973),
 ('aged', 0.014773315829198599),
 ('exist', 0.014753607788843272),
 ('torture', 0.014733998799388378),
 ('prove', 0.014729418674653008),
 ('tolerable', 0.014680880104255795),
 ('four', 0.014654547592632501),
 ('acceptable', 0.014651730694965849),
 ('chick', 0.01464142839879883),
 ('unimaginative', 0.014629366067627063),
 ('whiny', 0.014626751487134583),
 ('artsy', 0.014597921349167282),
 ('decide', 0.014596087755808985),
 ('unpleasant', 0.014539257963097203),
 ('rotten', 0.014526987482368666),
 ('racist', 0.014521318292204648),
 ('air', 0.014513999400043543),
 ('flimsy', 0.014510298364381129),
 ('baldwin', 0.014458793249711607),
 ('merely', 0.014423588430956464),
 ('wood', 0.014405182128559187),
 ('thinking', 0.014365675477621544),
 ('earth', 0.01435295387020083),
 ('kidding', 0.014337420788166327),
 ('unintentionally', 0.014336443850996718),
 ('vampires', 0.014325905430975228),
 ('generic', 0.014319871170399826),
 ('defense', 0.014290336242912221),
 ('saif', 0.014289573796132724),
 ('asleep', 0.014289012435576953),
 ('execution', 0.014283962008273423),
 ('figure', 0.014283770855230152),
 ('lackluster', 0.014273058981901452),
 ('hoped', 0.01426472476234584),
 ('nonsense', 0.01426134149720314),
 ('horrid', 0.014253216604458432),
 ('god', 0.01423736354744793),
 ('l', 0.014187296773742582),
 ('caricatures', 0.01418156420832665),
 ('starts', 0.014153430344591598),
 ('dry', 0.014133935534427948),
 ('display', 0.014128179969827093),
 ('button', 0.014116471162614747),
 ('bore', 0.014116389381443268),
 ('empty', 0.014096772700681898),
 ('harold', 0.014052130896646567),
 ('incomprehensible', 0.014009428713655193),
 ('annie', 0.014008405850952518),
 ('thrown', 0.014007462594894679),
 ('incredibly', 0.014005185007294368),
 ('renting', 0.013926687608630483),
 ('connect', 0.013922471736926742),
 ('younger', 0.013921148395141745),
 ('author', 0.013908729139553409),
 ('mistakes', 0.013902060662024722),
 ('vague', 0.013900188409028458),
 ('susan', 0.013899718009237958),
 ('obvious', 0.013862928310275269),
 ('public', 0.013848261281553191),
 ('porn', 0.013842110384054578),
 ('trash', 0.013803990572178482),
 ('stevens', 0.013796967244647431),
 ('sequels', 0.01378246386147269),
 ('hurt', 0.013769543921240126),
 ('desert', 0.013763619124969737),
 ('did', 0.013737639449728188),
 ('behave', 0.013719767167839484),
 ('served', 0.013714838239223709),
 ('claims', 0.013706886269650513),
 ('ultimately', 0.013697643591100159),
 ('wide', 0.013685211021307755),
 ('wow', 0.013679184770624811),
 ('worthless', 0.013670533296298292),
 ('dear', 0.013653591379600139),
 ('plodding', 0.013622845840855246),
 ('mike', 0.013594086031988721),
 ('favor', 0.013578310381078498),
 ('call', 0.013577646631327928),
 ('biggest', 0.01352994758638958),
 ('worthy', 0.013524754842185325),
 ('meaning', 0.013517997531900566),
 ('scientific', 0.013515396653842859),
 ('hanks', 0.013467213376215903),
 ('ads', 0.013463653421760934),
 ('gay', 0.013414840808688232),
 ('embarrassingly', 0.013401336286973735),
 ('literary', 0.013389208999321041),
 ('playing', 0.01332995463472637),
 ('bo', 0.013312890564682506),
 ('manipulative', 0.013287016941406332),
 ('dressed', 0.013285092423656565),
 ('embarrassment', 0.013269530319198222),
 ('regarding', 0.01323325021163166),
 ('stilted', 0.013215539220141913),
 ('sleeve', 0.013215085161586725),
 ('rating', 0.013203442200940888),
 ('kills', 0.013183919467358739),
 ('sounds', 0.013178727878711719),
 ('ali', 0.013173031266866373),
 ('non', 0.01316260375180525),
 ('pie', 0.013161492629253844),
 ('populated', 0.013152746747459268),
 ('killing', 0.0131118608531518),
 ('else', 0.013110592541316701),
 ('schneider', 0.013093514941690407),
 ('priest', 0.013071537555948207),
 ('hollow', 0.013068001463175461),
 ('shower', 0.013029604174841071),
 ('ruins', 0.013021597567104503),
 ('mental', 0.013019696244479819),
 ('this', 0.01300977816966453),
 ('pregnant', 0.012997074834619548),
 ('make', 0.012992851916498661),
 ('timberlake', 0.012979689860020446),
 ('saves', 0.012915795355367856),
 ('vastly', 0.012914828969565759),
 ('swear', 0.012901059475490067),
 ('stella', 0.012883911119651204),
 ('grave', 0.01288255504027714),
 ('thats', 0.012861061812910347),
 ('drinking', 0.0128601294710197),
 ('boom', 0.012851779594694182),
 ('introduction', 0.012831129197335457),
 ('programming', 0.012796219757750261),
 ('career', 0.012773059501084118),
 ('stereotype', 0.012769447626661462),
 ('attractive', 0.012765873120010152),
 ('victims', 0.012749299245502175),
 ('pass', 0.012735021821089293),
 ('experiment', 0.012716112941788907),
 ('retarded', 0.012713099529852416),
 ('stuck', 0.01270933269825327),
 ('akshay', 0.012684273069877868),
 ('cut', 0.012676285239015489),
 ('shoddy', 0.012674792040888049),
 ('damme', 0.012666536417656674),
 ('inaccurate', 0.01265368757753655),
 ('ray', 0.012649818023510173),
 ('woman', 0.012646521945546347),
 ('research', 0.01264049466286456),
 ('mile', 0.012627245693716727),
 ('place', 0.012624645831509405),
 ('demon', 0.012621688470792602),
 ('vulgar', 0.012612150302693321),
 ('engage', 0.012602272831074856),
 ('wives', 0.012601890190118297),
 ('mention', 0.01258159848000647),
 ('if', 0.012569631262234718),
 ('cartoon', 0.012561864177985766),
 ('unbelievably', 0.012550391668315839),
 ('only', 0.012517107727859139),
 ('ended', 0.012507282716729802),
 ('stereotypical', 0.012506426536204353),
 ('spent', 0.012503032775055236),
 ('thing', 0.012483110991541414),
 ('phone', 0.012464039991489125),
 ('stock', 0.012446742147556611),
 ('drop', 0.012432978683590463),
 ('self', 0.012432059211520796),
 ('headache', 0.012424495134195477),
 ('escapes', 0.012419211298248923),
 ('conceived', 0.012392639977060707),
 ('required', 0.012392260947042837),
 ('assassin', 0.012332404091910091),
 ('meat', 0.012327751187890434),
 ('therefore', 0.012316138729629621),
 ('struggling', 0.0123086283535723),
 ('ho', 0.01230771493626571),
 ('ta', 0.012299409649320241),
 ('cold', 0.012289510775209263),
 ('expects', 0.012271684887263186),
 ('furthermore', 0.012263298696316208),
 ('remote', 0.012254529263879219),
 ('cgi', 0.012250569964074179),
 ('arab', 0.01223023211522525),
 ('feminist', 0.012220004405980549),
 ('hair', 0.012213792907949602),
 ('intelligence', 0.012203964889416771),
 ('destroy', 0.012190213907023963),
 ('cameo', 0.012186034087855138),
 ('claus', 0.012181510618531247),
 ('awake', 0.012171290237450149),
 ('sums', 0.012139945909251909),
 ('auto', 0.012126012687040621),
 ('cue', 0.012120943623008968),
 ('speak', 0.012117784815618111),
 ('stereotypes', 0.012106976159466589),
 ('footage', 0.012103658001584288),
 ('maker', 0.012093369539270352),
 ('rental', 0.012083052888147327),
 ('proper', 0.012063210621690411),
 ('mercifully', 0.012047936344961967),
 ('gimmick', 0.012041001769926649),
 ('coherent', 0.012027899920693617),
 ('inane', 0.011993175877578831),
 ('relies', 0.011992345660343814),
 ('nomination', 0.011982252573531256),
 ('segal', 0.011947340234058409),
 ('christians', 0.011946398905489906),
 ('overrated', 0.011926101166626018),
 ('don', 0.011924357980777282),
 ('severely', 0.01191616855223732),
 ('phony', 0.011913822393121729),
 ('selfish', 0.011900529017180242),
 ('resume', 0.011897346320859059),
 ('another', 0.011877684431361637),
 ('sean', 0.011876040214137613),
 ('hepburn', 0.011869243078008906),
 ('secondly', 0.011863109334450284),
 ('ups', 0.011859394818287424),
 ('planet', 0.011852030247443588),
 ('changed', 0.011845335611887487),
 ('amused', 0.011842962845878574),
 ('lowest', 0.011831634819501927),
 ('fools', 0.011824116232842368),
 ('spelling', 0.011821902194872625),
 ('repressed', 0.011821527286346349),
 ('unlikeable', 0.011818760110586484),
 ('failure', 0.011816519901709054),
 ('line', 0.0117964385718739),
 ('hyped', 0.011784666544684314),
 ('anti', 0.011764086315539176),
 ('acting', 0.01175234831420538),
 ('promise', 0.011749711660046633),
 ('observe', 0.011739608959278629),
 ('mindless', 0.011729368774426886),
 ('lacked', 0.011718485221863714),
 ('rather', 0.011704535222487896),
 ('ed', 0.011700096242496991),
 ('significant', 0.011696176501939935),
 ('talks', 0.01167810147608688),
 ('arty', 0.011674972481678904),
 ('spit', 0.011671408526135135),
 ('ilk', 0.011661568455359036),
 ('unoriginal', 0.01165110724584089),
 ('forward', 0.01164671953310609),
 ('toilet', 0.011635522207639078),
 ('suppose', 0.011633258510072188),
 ('feed', 0.011617447517425158),
 ('surrounded', 0.011607897169523131),
 ('wanted', 0.011604506869089728),
 ('tashan', 0.011596205445299112),
 ('dr', 0.011543949281335654),
 ('scare', 0.011543316667712916),
 ('murderer', 0.011535350571639669),
 ('explained', 0.011466329649783226),
 ('cheated', 0.011455846970137717),
 ('whats', 0.011451443577230852),
 ('romance', 0.011445558616225333),
 ('jewish', 0.01144156416364368),
 ('sexual', 0.011438682797255694),
 ('books', 0.011419811777535165),
 ('throwing', 0.011404165894740245),
 ('nose', 0.01139558365172063),
 ('parking', 0.011390688400833916),
 ('pick', 0.011357671445382181),
 ('chose', 0.011354353327826125),
 ('improve', 0.011350584813053916),
 ('kapoor', 0.01134076781407491),
 ('costs', 0.01132590072689099),
 ('saying', 0.011325617629551343),
 ('early', 0.011320525734188101),
 ('technically', 0.011317672837061943),
 ('hackman', 0.011288294849240654),
 ('birthday', 0.011282785404027751),
 ('cinematography', 0.011263572785831703),
 ('hurts', 0.011250154303091528),
 ('saturday', 0.011247837147971248),
 ('meaningless', 0.011239510238506721),
 ('mannered', 0.011239044207972258),
 ('screaming', 0.011238620310222368),
 ('should', 0.011236648355832369),
 ('crazed', 0.011236418275421323),
 ('dignity', 0.011236150963786553),
 ('mate', 0.011216700009844505),
 ('letters', 0.011208675517174485),
 ('recycled', 0.011206236378205579),
 ('promptly', 0.011202237607822152),
 ('inexplicably', 0.011161321811546261),
 ('or', 0.011152965343305354),
 ('simply', 0.011146233896835885),
 ('too', 0.011130044921930279),
 ('nerd', 0.011122543127721434),
 ('chris', 0.011116119389820139),
 ('proceedings', 0.011111786695547103),
 ('lived', 0.011100598930695578),
 ('code', 0.011095425242701427),
 ('potentially', 0.011093285835678517),
 ('open', 0.011075631889800956),
 ('faster', 0.011074177906888303),
 ('moore', 0.011070458274337771),
 ('bowl', 0.011060417562531434),
 ('absolutely', 0.011044130796846869),
 ('just', 0.011033356854991551),
 ('suspension', 0.01103178117307213),
 ('enemy', 0.011025820754518637),
 ('conclusion', 0.010986051066943354),
 ('hospital', 0.010977494845678703),
 ('romances', 0.010962761722118314),
 ('spoke', 0.010962116403553664),
 ('hardly', 0.010960545391113449),
 ('olds', 0.010951344004097443),
 ('creek', 0.01095002392432287),
 ('shouting', 0.010943727502542746),
 ('originality', 0.010912963822714929),
 ('bollywood', 0.010911409137577788),
 ('cape', 0.010902326129518278),
 ('teeth', 0.010900502046002621),
 ('backdrop', 0.010885688008708724),
 ('turn', 0.01088047805942566),
 ('mason', 0.010866951716170666),
 ('grace', 0.01084840625738232),
 ('valley', 0.01084518042587585),
 ('depressing', 0.010827818086738501),
 ('superficial', 0.010826403237558541),
 ('invested', 0.01081248871664086),
 ('bomb', 0.010811727591767125),
 ('embarrass', 0.010778451069403571),
 ('sided', 0.010773707983617689),
 ('sticking', 0.01076229243554771),
 ('common', 0.010754536408451015),
 ('boat', 0.010750196487059141),
 ('promised', 0.010746025901289745),
 ('wayans', 0.010744338945929417),
 ('sheer', 0.010734103279474513),
 ('wrestling', 0.010724515540975419),
 ('staff', 0.010715523520497063),
 ('apollo', 0.010711377643774774),
 ('leigh', 0.010702080598678559),
 ('virtually', 0.010691942663824013),
 ('seagal', 0.01067732410067212),
 ('comes', 0.010674899719725496),
 ('edition', 0.010673353805904192),
 ('predictably', 0.010666551243955748),
 ('stuff', 0.01066491581148326),
 ('gang', 0.010664441184213122),
 ('cancer', 0.010643225900463581),
 ('obviously', 0.01064167008065452),
 ('would', 0.010623530922231171),
 ('totally', 0.010616092995147892),
 ('profile', 0.010596003501785219),
 ('spacey', 0.010595967407784398),
 ('ability', 0.010584592521360157),
 ('horrendous', 0.010580213328532092),
 ('blood', 0.010579520401095324),
 ('imitation', 0.010568550630572965),
 ('bikini', 0.010568043371931096),
 ('talented', 0.01056600103597944),
 ('basis', 0.010564729746933198),
 ('dialogs', 0.010551191397294017),
 ('showing', 0.010548613564454233),
 ('door', 0.010544563357219766),
 ('portray', 0.010527799628490622),
 ('strictly', 0.010526959295132313),
 ('mexican', 0.01050873151782233),
 ('stick', 0.010465961443388676),
 ('east', 0.010455324716016763),
 ('anywhere', 0.010431532734666286),
 ('remake', 0.010419869194952832),
 ('am', 0.010410414209203927),
 ('attempting', 0.010386393998627379),
 ('disturbing', 0.010381152608581428),
 ('jude', 0.010377136500506758),
 ('wondering', 0.010363512690012205),
 ('celebrated', 0.010360111769075867),
 ('use', 0.010350554074714642),
 ('wreck', 0.010344734410393918),
 ('appear', 0.010344438351539182),
 ('entitled', 0.010335246001593067),
 ('youth', 0.010323214445994808),
 ('letdown', 0.010318553446258687),
 ('moran', 0.010305507693633363),
 ('mediocrity', 0.010302827140695378),
 ('news', 0.010292874788426099),
 ('bits', 0.010276065293631165),
 ('alone', 0.010268492053981962),
 ('accents', 0.010263852094534696),
 ('inhabited', 0.010244117693024819),
 ('mock', 0.01024406136067591),
 ('g', 0.01022345817540379),
 ('box', 0.010203304329265748),
 ('term', 0.010199983044386105),
 ('behavior', 0.010198776124373247),
 ('tedium', 0.010190092201507222),
 ('intent', 0.01019003812069858),
 ('husband', 0.010189502265957835),
 ('presence', 0.01018719233607417),
 ('z', 0.010184318583214768),
 ('unappealing', 0.010146391189444371),
 ('much', 0.010136790117697142),
 ('tree', 0.010113534581593916),
 ('doctors', 0.010099854380484188),
 ('pi', 0.010095099419111339),
 ('rodney', 0.010090819798082384),
 ('franchise', 0.010089650929674208),
 ('piece', 0.01008601154958534),
 ('company', 0.01008353958260105),
 ('choppy', 0.010079223420593737),
 ('turned', 0.010069855547990135),
 ('test', 0.010041505355613904),
 ('ball', 0.010040944323609536),
 ('hated', 0.010035509058945865),
 ('bear', 0.010034272465057467),
 ('serves', 0.010027495172169233),
 ('leonard', 0.010022751390164696),
 ('deserved', 0.010022334081283366),
 ('part', 0.010016360436147429),
 ('opportunity', 0.010013126012646699),
 ('turning', 0.010011850960865766),
 ('overacting', 0.010008994714980214),
 ('refer', 0.010006488920574083),
 ('flies', 0.010006418749637636),
 ('uninvolving', 0.0099991338976208183),
 ('produce', 0.0099962014038013757),
 ('jumpy', 0.0099947855808415163),
 ('die', 0.0099914129058671069),
 ('root', 0.009974713500112831),
 ('insomnia', 0.0099744642555285121),
 ('blatant', 0.0099596620005663883),
 ('larry', 0.0099556905367902595),
 ('threw', 0.0099473965388449607),
 ('billed', 0.0099285818753670936),
 ('bullets', 0.0099281758971005961),
 ('intellectually', 0.0099081388278786167),
 ('rip', 0.0099013233996040912),
 ('stretching', 0.0099012969699172667),
 ('protest', 0.0098984552675623651),
 ('soldiers', 0.0098936923822449136),
 ('flick', 0.0098870633649776659),
 ('justin', 0.009862246602717565),
 ('highlights', 0.0098589088020586361),
 ('move', 0.0098539899809540355),
 ('merit', 0.0098431205949966755),
 ('russian', 0.0098411717219841037),
 ('security', 0.0098373450338831159),
 ('idiotic', 0.009834123428814465),
 ('produced', 0.0098294307574257923),
 ('king', 0.0098266872343175677),
 ('magically', 0.0098228842476825624),
 ('united', 0.0098070847890707729),
 ('missile', 0.0097990578193348568),
 ('unlikable', 0.009786915898648085),
 ('ignorant', 0.0097732743173461027),
 ('amateur', 0.0097674059870561138),
 ('bachelor', 0.0097673429455405782),
 ('asylum', 0.009762733851977996),
 ('screw', 0.0097568098573927176),
 ('report', 0.00974792326991724),
 ('dracula', 0.009746732339320564),
 ('removed', 0.0097416519499422139),
 ('confess', 0.0097162925211573287),
 ('brand', 0.0097152534660907616),
 ('conspiracy', 0.0097116972290397074),
 ('horribly', 0.0097083785564252515),
 ('switch', 0.009702684093379552),
 ('jaws', 0.0096877455513713108),
 ('unsuspecting', 0.0096853425035846458),
 ('betty', 0.0096770352133324754),
 ('forwarding', 0.0096711196893192845),
 ('university', 0.0096636715878149534),
 ('star', 0.00966232549318005),
 ('crawl', 0.0096464318968590597),
 ('dopey', 0.0096460863315858646),
 ('ruin', 0.0096230106385457228),
 ('lifeless', 0.0096228807274879972),
 ('flash', 0.0096193625359649992),
 ('whoever', 0.0096174128915875404),
 ('coincidence', 0.0096024599741402171),
 ('choosing', 0.0095951100051069257),
 ('avid', 0.0095900913284222671),
 ('intended', 0.0095846987041676244),
 ('remained', 0.0095839628178583831),
 ('c', 0.0095732676681762538),
 ('waiting', 0.0095562258694349058),
 ('cassie', 0.0095481354442238115),
 ('garage', 0.0095349544587830255),
 ('clarke', 0.0095345445855698676),
 ('fortune', 0.0095330396648302066),
 ('interminable', 0.0095328159563552659),
 ('incessant', 0.0095235485026846384),
 ('plots', 0.0095225805490624735),
 ('danger', 0.0095171205654693055),
 ('costumes', 0.0094980144667524517),
 ('evidently', 0.0094952158467012243),
 ('minus', 0.0094911495174661246),
 ('reporters', 0.009483681104099086),
 ('israeli', 0.0094750077183364655),
 ('failing', 0.0094711841313976971),
 ('paying', 0.00946923440668513),
 ('godzilla', 0.0094586915548437837),
 ('dumber', 0.0094582903092924851),
 ('earn', 0.0094476224928425039),
 ('slows', 0.0094467463872487598),
 ('held', 0.0094452736817914867),
 ('chase', 0.0094438362611946377),
 ('lies', 0.0094383969845033416),
 ('hands', 0.0094381781614589176),
 ('grief', 0.00942384945341029),
 ('brains', 0.0094182153416632122),
 ('tom', 0.009413043338434738),
 ('resurrected', 0.0094083423437290557),
 ('asking', 0.0094021029403453284),
 ('sleeps', 0.0094017951882658275),
 ('porno', 0.0093907201413965073),
 ('somehow', 0.0093889261270860523),
 ('sarcasm', 0.0093886064393904119),
 ('tie', 0.0093856009366311537),
 ('fall', 0.0093801640008931257),
 ('bring', 0.0093791273545761489),
 ('rape', 0.0093760851230746418),
 ('village', 0.0093684513318614097),
 ('kitchen', 0.0093649071460109538),
 ('concerned', 0.0093611353238811489),
 ('republic', 0.0093499426948764237),
 ('hell', 0.0093400360705317223),
 ('inducing', 0.0093382129792553593),
 ('stomach', 0.009337828638515849),
 ('shambles', 0.009333545732982982),
 ('virgin', 0.0093312001339055945),
 ('extraneous', 0.009325041380035138),
 ('cameras', 0.0093229460267977189),
 ('suffers', 0.0093204929924830034),
 ('justified', 0.009316321747936309),
 ('plummer', 0.0092948273285103945),
 ('ponderous', 0.0092880344237223321),
 ('player', 0.0092802296345443781),
 ('survivor', 0.0092767026472125765),
 ('rainy', 0.0092697034218137461),
 ('graces', 0.0092620944963291273),
 ...]
In [83]:
import matplotlib.colors as colors

# Collect the words to plot: up to 500 entries from each end of the
# pos_neg_ratios.most_common() ranking, keeping only the words that have an
# entry in mlp_full.word2index.
vocabulary = mlp_full.word2index.keys()
top_ranked = pos_neg_ratios.most_common(500)
bottom_ranked = pos_neg_ratios.most_common()[::-1][:500]

words_to_visualize = []
for ranked in (top_ranked, bottom_ranked):
    for word, ratio in ranked:
        if word in vocabulary:
            words_to_visualize.append(word)
In [84]:
# For every selected word, record its row of mlp_full.weights_0_1 and a colour
# chosen by the sign of its pos_neg_ratios score, while counting how many words
# land on each side.
colors_list = []
vectors_list = []
pos = neg = 0

for word in words_to_visualize:
    # Words without a ratio are skipped entirely (no vector, no colour).
    if word not in pos_neg_ratios.keys():
        continue

    vectors_list.append(mlp_full.weights_0_1[mlp_full.word2index[word]])

    is_positive = pos_neg_ratios[word] > 0
    if is_positive:
        pos += 1
    else:
        # A ratio of exactly zero is counted with the negative words.
        neg += 1
    colors_list.append("#00ff00" if is_positive else "#000000")
    
In [85]:
from sklearn.manifold import TSNE
# Embed the collected word vectors into two dimensions with t-SNE, using a
# fixed random_state as the seed.
tsne_settings = dict(n_components=2, random_state=0)
tsne = TSNE(**tsne_settings)
words_top_ted_tsne = tsne.fit_transform(vectors_list)
In [87]:
# Interactive scatter plot of the 2-D t-SNE coordinates, one labelled point
# per selected word.
# green indicates positive words, black indicates negative words

# All per-point data lives in a single ColumnDataSource so that the glyphs and
# the text labels refer to the same columns by name.
tsne_columns = dict(
    x1=words_top_ted_tsne[:, 0],
    x2=words_top_ted_tsne[:, 1],
    names=words_to_visualize,
    color=colors_list,
)
source = ColumnDataSource(data=tsne_columns)

p = figure(
    title="vector T-SNE for most polarized words",
    toolbar_location="above",
    tools="pan,wheel_zoom,reset,save",
)
p.scatter(x="x1", y="x2", size=8, source=source, fill_color="color")

# Draw each word slightly above its marker.
word_labels = LabelSet(
    x="x1",
    y="x2",
    text="names",
    y_offset=6,
    text_font_size="8pt",
    text_color="#555555",
    text_align='center',
    source=source,
)
p.add_layout(word_labels)

show(p)
/anaconda3/envs/sentiment/lib/python3.6/site-packages/bokeh/util/deprecation.py:34: BokehDeprecationWarning: 
Supplying a user-defined data source AND iterable values to glyph methods is deprecated.

See https://github.com/bokeh/bokeh/issues/2056 for more information.

  warn(message)
/anaconda3/envs/sentiment/lib/python3.6/site-packages/bokeh/util/deprecation.py:34: BokehDeprecationWarning: 
Supplying a user-defined data source AND iterable values to glyph methods is deprecated.

See https://github.com/bokeh/bokeh/issues/2056 for more information.

  warn(message)