The Queen's Gambit: A Machine Learning Analysis of the Chess Community's Attitude After the Hit Netflix Show ¶


Did The Queen's Gambit Change the Chess Community's Attitude toward Women?¶

An analysis of sentiment in posts regarding women from chess.com before and after the airing of the hit Netflix show¶


Most who have spent even a small amount of time in the world of chess know that it can be a sexist space. It is a community of respected intellectual elites, some of whom are prone to endorsing discriminatory ideas, such as Nigel Short's claim that men are 'hardwired' to be better chess players[1]. Interestingly, though, arguably the most significant cultural event to occur in the chess space since World Champion Garry Kasparov was defeated by IBM's Deep Blue computer in 1997 was the airing (and subsequent ascent to Netflix's most-watched scripted series[2]) of the female-led chess drama The Queen's Gambit.

I wanted to know whether the advent of this show, which does not shy away from depicting the misogyny inherent in chess, has effected any shift in the attitude of the chess community at large towards women in the royal game.

How could I begin to investigate this idea?

My strategy¶

I spent an afternoon considering how to go about researching this question, and here are the steps I came up with:

  • Retrieving the data: I would find an online community as close to representative of the chess community at large as I could and retrieve said community's posts, comments, or other sorts of data over an appropriate span of time so that I could see any trends related to the debut of The Queen's Gambit
  • Cleaning the data: I would then need to clean my data by removing empty, erroneous, aberrant, or otherwise problematic data
  • Tokenizing the data: Next, I would have to 'tokenize' my data to convert strings to more easily usable data structures for my algorithms
  • Normalizing the data: Then, I would have to normalize my data to remove data not useful to a computer and bring the data remaining down to its basic forms
  • Finding female words: Now, I would reduce my data set down to only those posts which refer to women in some way
  • Performing Sentiment Analysis: I would now need to run sentiment analysis on my data and store my results
  • Removing outliers: At this point, I would need to find outliers in the sentiment scores and deal with the associated data points appropriately
  • Plotting the data: Penultimately, I would have to plot my results along with any descriptive or summative plots (such as lines of best fit)
  • Conclusion: Finally, I would need to draw any conclusions from my data, if any were to be inferred, and talk about the project and findings in a broader way, including issues and takeaways

I also looked around to see if this question had already been addressed and could not find any such study. A couple of noteworthy studies used sentiment analysis in the context of the chess community's reaction to The Queen's Gambit (which I cite at the end of this paper). However, none of them focused on the question of sexism particularly.

I planned to study sentiment relating to women within the chess community, not sentiment relating to the series.

I had outlined a strategy for myself, and it was now time to start on step one.

Retrieving the data¶

Firstly, and most fundamentally, I needed data. I decided to use the General Forum[3] of the website Chess.com. I chose this because chess.com currently has the largest userbase of any chess website, and I therefore expect it to be the most representative of the community. I also considered using the lichess.org forums as well as the subreddit r/chess. I chose not to use lichess.org because its userbase is far smaller than that of chess.com, and I chose against r/chess since there is no easy way to scrape posts chronologically from Reddit.

I needed a significant window of time to collect data so that I could compare before and after the show aired. I chose to gather data using the airing of the show (October 23rd, 2020) as a pivot, collecting posts from that moment to the present (June 20th, 2021 at the time) and before that moment back the same amount of time (roughly March 12th, 2020).
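As a quick sanity check on that window, here is a minimal sketch (using only the dates stated above; it is not part of the scraping pipeline) showing that the two halves of the window come out roughly equal in length:

from datetime import datetime

airing = datetime(2020, 10, 23)       # debut of The Queen's Gambit (the pivot)
window_end = datetime(2021, 6, 20)    # "the present" at the time of the study
window_start = datetime(2020, 3, 12)  # chosen stop date for scraping

print(window_end - airing)      # length of the "after" half of the window
print(airing - window_start)    # length of the "before" half (roughly the same)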

Google Trends[4] unsurprisingly shows a clear spike in the search term "Queen's Gambit" at precisely the airing of the hit show, and a gradual fall off to slightly above its previous baseline thereafter.

Google Trends for keyword "The Queen's Gambit"

This data seemed promising. So now it was time to actually retrieve the data. To accomplish the data retrieval part of this study, I would need the requests and time packages for getting the raw pages from the chess.com forums, BeautifulSoup to navigate that raw data, and NumPy, pandas, datetime, and csv to process and save it.

I also went ahead and saved the starting page from which my algorithm would begin its scraping of the chess.com forum (a seed as it were) and the date at which I wanted to stop retrieving data.

In [1]:
# Import requisite packages
# setup global variables
# and read in data
import requests
import csv
import time
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import numpy as np
# web address for the "General Chess Discussion" forum on Chess.com
first_page_url = "https://www.chess.com/forum/category/general"
# date representing the farthest back in the forum you wish to scrape
# not precise, might pull in a few posts older than defined stop date
stop_date = datetime(2020, 3, 12)

I would need to retrieve pages from chess.com more than once, as the forum is structured as many pages of links to threads. So I constructed a function for obtaining the BeautifulSoup representation of a page, given its URL as an input, as seen below in the function get_soup(). (modified from code provided by Prof. McGrath in class[5])

In [2]:
# get_soup() is modified from code shown in a lecture by Sean McGrath @ https://www.coursera.org/learn/uol-cm2015-programming-with-data/lecture/kWE1l/5-05-introduction-to-web-scraping
def get_soup(URL, jar=None, print_output=False):
    if print_output:
        print("scraping {}".format(URL))
    request_headers = {
        "update-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/2010 0101 Firefox/47.0",
        "accept": "*/*",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "en-US,en;q=0.8"
    }
    if jar:
        r = requests.get(URL, cookies=jar, headers=request_headers)
    else:
        r = requests.get(URL, headers=request_headers)
        jar = requests.cookies.RequestsCookieJar()
    data = r.text
    soup = BeautifulSoup(data, "html.parser")
    return soup, jar

Since the main view of the forum is a page of links to posts, each containing only a small preview of the post, I would need to take this in two stages: first, obtain all the links within my window of time, then obtain all posts and comments from the collected links. As these links are spread over many pages, I decided to create a general function that could receive the BeautifulSoup representation of a page and return the links contained within. This I did with get_page_post_links(), shown below. Of note is the returned value earliest_post_date, which acts as a control flag. With this, my main routine can compare the date of the earliest post found on the returned page against the global stop_date to determine whether it should continue.

In [3]:
# function for retrieving all the relevant posts on a particular page
def get_page_post_links(soup):
    # make a container to hold and work with collected posts
    # and keep track of the current date
    post_links = []
    earliest_post_date = None
    # get post thumbnails with bs4
    post_preview_elements = soup.find_all("tr", class_="forums-category-thread-row")
    # extract links and dates from previews
    for post_preview_element in post_preview_elements:
        post_links.append(
            post_preview_element.find_all(
                "a",
                class_="forums-category-thread-title"
            )[0]['href']
        )
        latest_elem = post_preview_element.find_all("div", class_="forums-category-thread-latest-row")
        time_elem = latest_elem[1].find_all("span")
        earliest_post_date = time_elem[0]['title']
    earliest_post_date = datetime.strptime(earliest_post_date, "%b %d, %Y, %I:%M:%S %p")
    return post_links, earliest_post_date

Now I was ready to write a function that could call the previous two iteratively to walk through each page in the forum and combine all the links into one collection for individual scraping. It's worth pointing out that I included some print() statements here, as well as within get_soup(), so that I could keep an eye on things as they processed. It's also important to note that I used the sleep() function from the time package (line 21 of get_posts_over_time_period() and line 7 of get_data_from_posts()) to force a pause between each call to chess.com's servers. The purpose was both to prevent me from being blocked by chess.com and to avoid overburdening their servers and causing any problems for them or their users.

In [4]:
# function for paginating through the forum and retrieving all posts submitted
# after a user-defined start date (stop_date)
def get_posts_over_time_period(url, stop_date):
    # assign the control variable the default time of now
    # as we know this will be later than the input stop_date
    earliest_post_date_on_page = datetime.now()
    # container for post links
    all_post_links = []
    soup, jar = None, None
    num_collected = 0
    # while the earliest found date on the page is still later
    # than our desired stop point, continue collecting all post links
    while (earliest_post_date_on_page > stop_date):
        # print output periodically for user monitoring
        print_output = num_collected % 2000 == 0
        if print_output:
            print(
                "{} > {}: proceeding".format(
                    earliest_post_date_on_page.strftime("%b %d, %Y, %I:%M:%S %p"),
                    stop_date.strftime("%b %d, %Y, %I:%M:%S %p")
                )
            )
        # if first call, collect cookies, otherwise include collected
        # cookies, find link to next page, and implement wait to avoid 
        # being flagged as a bot and reduce strain on chess.com's servers
        if (soup == None):
            soup, jar = get_soup(url, None, print_output)
        else:
            link_to_next_page = soup.find("a", class_="pagination-next")['href']
            time.sleep(2)
            soup, jar = get_soup(link_to_next_page, jar, print_output)
        # get all links to posts on page
        post_links, earliest_post_date_on_page = get_page_post_links(soup)
        # add to container
        all_post_links.extend(post_links)
        # update our tracker for output printing
        num_added = len(post_links)
        num_collected += num_added
        if print_output:
            print("added {} posts".format(num_added))
            print("{} collected".format(num_collected))
    return pd.DataFrame(all_post_links, index=range(len(all_post_links)), columns=['link'])

post_links_df = get_posts_over_time_period(first_page_url, stop_date)
post_links_df.to_csv("data/forum_page_links.csv")
Jun 19, 2021, 05:54:21 PM > Mar 12, 2020, 12:00:00 AM: proceeding
scraping https://www.chess.com/forum/category/general
added 20 posts
20 collected
May 25, 2021, 05:13:05 AM > Mar 12, 2020, 12:00:00 AM: proceeding
scraping https://www.chess.com/forum/category/general?page=101
added 20 posts
2020 collected
May 02, 2021, 03:08:11 AM > Mar 12, 2020, 12:00:00 AM: proceeding
scraping https://www.chess.com/forum/category/general?page=201
added 20 posts
4020 collected
Apr 09, 2021, 03:35:35 PM > Mar 12, 2020, 12:00:00 AM: proceeding
scraping https://www.chess.com/forum/category/general?page=301
added 20 posts
6020 collected
Mar 12, 2021, 01:19:36 PM > Mar 12, 2020, 12:00:00 AM: proceeding
scraping https://www.chess.com/forum/category/general?page=401
added 20 posts
8020 collected
Feb 15, 2021, 04:49:00 PM > Mar 12, 2020, 12:00:00 AM: proceeding
scraping https://www.chess.com/forum/category/general?page=501
added 20 posts
10020 collected
Jan 20, 2021, 10:45:19 AM > Mar 12, 2020, 12:00:00 AM: proceeding
scraping https://www.chess.com/forum/category/general?page=601
added 20 posts
12020 collected
Dec 18, 2020, 05:33:41 PM > Mar 12, 2020, 12:00:00 AM: proceeding
scraping https://www.chess.com/forum/category/general?page=701
added 20 posts
14020 collected
Nov 13, 2020, 06:39:45 PM > Mar 12, 2020, 12:00:00 AM: proceeding
scraping https://www.chess.com/forum/category/general?page=801
added 20 posts
16020 collected
Oct 03, 2020, 06:35:52 PM > Mar 12, 2020, 12:00:00 AM: proceeding
scraping https://www.chess.com/forum/category/general?page=901
added 20 posts
18020 collected
Aug 20, 2020, 12:26:21 AM > Mar 12, 2020, 12:00:00 AM: proceeding
scraping https://www.chess.com/forum/category/general?page=1001
added 20 posts
20020 collected
Jul 03, 2020, 04:52:57 AM > Mar 12, 2020, 12:00:00 AM: proceeding
scraping https://www.chess.com/forum/category/general?page=1101
added 20 posts
22020 collected
May 17, 2020, 12:05:01 PM > Mar 12, 2020, 12:00:00 AM: proceeding
scraping https://www.chess.com/forum/category/general?page=1201
added 20 posts
24020 collected
Mar 25, 2020, 01:40:22 AM > Mar 12, 2020, 12:00:00 AM: proceeding
scraping https://www.chess.com/forum/category/general?page=1301
added 20 posts
26020 collected

Now I had links to every post within my chosen window of time saved to a CSV file on my machine.

In [5]:
post_links_df = pd.read_csv("data/forum_page_links.csv", index_col=0)
print("I've collected {} posts!".format(len(post_links_df)))
I've collected 26380 posts!

I had collected links to over 26,000 posts - and that's not counting the comments on each of those posts. I felt like I had a fairly large dataset. But now I actually needed to get the content of these posts and their comments. The links alone were of no help to me.

So I wrote the function get_data_from_posts(), which would take in my collection of post links and grab the BeautifulSoup representation of each post page and pluck out the relevant content (text content and time stamp for both the original post and all comments on it).

In [7]:
# function for getting the text and timestamp of each post and its comments
def get_data_from_posts(links):
    # make a container to hold and work with collected data
    rows = []
    total = len(links)
    for i, link in enumerate(links['link']):
        time.sleep(2)
        print_output = len(rows) % 2000 == 0
        soup, _ = get_soup(link, None, print_output)
        if print_output:
            print(
                "{}/{}".format(
                    i + 1,
                    total
                )
            )
        # grab all comments
        comments = soup.find_all('div', class_="comment-post-component")
        for comment in comments:
            try:
                text = comment.find('p').text
            except:
                text = ""
            try:
                # try to obtain a timestamp
                timestamp_string = comment.find('span', class_="comment-post-actions-time").find('span')['title']
                timestamp = datetime.strptime(timestamp_string, "%b %d, %Y, %I:%M:%S %p")
            except:
                # if cannot, use np.nan
                timestamp = np.nan
            rows.append({
                "text": text,
                "timestamp": timestamp
            })
    post_data = pd.DataFrame(rows, index = range(len(rows)))
    return post_data

data = get_data_from_posts(post_links_df)
data.to_csv("data/data.csv")
scraping https://www.chess.com/forum/view/general/how-to-contact-support
1/26380
scraping https://www.chess.com/forum/view/general/why-are-people-being-mean-to-me-lately
712/26380
scraping https://www.chess.com/forum/view/general/prove-it
2557/26380
scraping https://www.chess.com/forum/view/general/flagging-is-it-unethical-or-part-of-the-game
2806/26380
scraping https://www.chess.com/forum/view/general/https-rumble-com-vfv1p3-its-back-to-the-board-for-chess-grandmasters-html
5103/26380
scraping https://www.chess.com/forum/view/general/chess-survey-mental-speed-study
6704/26380
scraping https://www.chess.com/forum/view/general/is-chess-finally-dead
7562/26380
scraping https://www.chess.com/forum/view/general/why-did-bobby-fischer-quit-chess
8135/26380
scraping https://www.chess.com/forum/view/general/bot-ratings-and-improvements
10280/26380
scraping https://www.chess.com/forum/view/general/wait-what-1
14065/26380
scraping https://www.chess.com/forum/view/general/disabled-chess-players-out-there
15583/26380
scraping https://www.chess.com/forum/view/general/what-do-you-guys-think-about-the-wijk-aan-zee-2013-tournament
16187/26380
scraping https://www.chess.com/forum/view/general/wrong-threefold-repetition
20522/26380
scraping https://www.chess.com/forum/view/general/can-we-see-are-pending-requests
20844/26380
scraping https://www.chess.com/forum/view/general/probleem-onvoldoende-materiaal
25044/26380

This function took almost 24 hours to iterate through my entire collection of post links; however, I ended up with 175,817 individual posts/comments!

In [8]:
data = pd.read_csv("data/data.csv", index_col=0)
print("I've collected {} data points!".format(len(data)))
I've collected 175817 data points!

Let's take a peek at this data.

In [31]:
data[400:440]
Out[31]:
text timestamp
400 ‎ 2021-06-19 05:40:54
401 we need to contact chess.com staff and issue a... 2021-06-19 07:32:51
402 Its so pathetic that they don't tell you you a... 2021-06-19 07:40:06
403 A single Mod dictator rules over LC . Break To... 2021-06-19 07:42:12
404 A single Mod dictator rules over LC . Break To... 2021-06-19 07:46:19
405 I realize I’m not the sharpest tool in the she... 2021-06-19 08:12:17
406 Its so pathetic that they don't tell you you a... 2021-06-19 08:14:01
407 we can be shadow ban brothers till the end of ... 2021-06-19 08:14:15
408 hhhello 2021-06-19 08:14:25
409 i got shadow banned 2021-06-19 08:20:49
410 we are all shadow ban brothers then 2021-06-19 08:21:14
411 we will rise up 2021-06-19 08:21:20
412 pray to our leder erik 2021-06-19 08:21:27
413 and when the time comes we will storm the lich... 2021-06-19 08:21:48
414 THE CULT OF ERIK 2021-06-19 08:21:56
415 I have seen a lot of advice for beginners that... 2021-05-30 09:15:00
416 Reasons i dont castle, if the queens come off ... 2021-05-30 09:42:45
417 There is no "one size fits all" answer. Castle... 2021-05-30 09:57:57
418 If you know your openings then you know when i... 2021-05-30 10:12:57
419 When u play the bongcloud 2021-05-30 10:20:10
420 To add a little nuance I take a different appr... 2021-05-30 10:57:56
421 Castling as quickly as possible leads to a saf... 2021-05-30 11:57:39
422 I found this series by Gotham chess on prevent... 2021-06-01 01:44:10
423 When the center is closed, sometimes it's bett... 2021-06-01 02:00:12
424 Castling is nearly always good. O-O is like 3 ... 2021-06-01 02:05:54
425 I have been looking through GM games. It is ra... 2021-06-01 02:21:05
426 You probably shouldn't castle when there is a ... 2021-06-01 03:35:07
427 @KevinOSh very perceptive! 2021-06-01 03:48:24
428 Just watched this video covering the game Sieg... 2021-06-02 03:51:28
429 I usually castle when possible, but i suck at ... 2021-06-02 04:05:48
430 When the centre is closed you don't need to ru... 2021-06-02 05:44:07
431 I usually castle when possible, but i suck at ... 2021-06-02 07:46:32
432 dont castle if: 2021-06-02 07:49:05
433 Reasons i dont castle, if the queens come off ... 2021-06-02 08:02:36
434 ‘Good players seldom castle’ 2021-06-02 08:09:32
435 How do you do this and make this in your descr... 2021-06-10 07:38:24
436 hi 2021-06-10 07:57:27
437 hi 2021-06-10 07:57:43
438 NaN 2021-06-10 07:58:06
439 how do you do that? 2021-06-10 07:58:35

It looks good, but I already see two problems: we have an empty post, or perhaps just whitespace, at index 400 and a NaN value at index 438. It's time to do some cleanup.

In [51]:
print(data[data.eq('2021-06-19 05:40:54').any(1)])
print(data[data.eq('2021-06-10 07:58:06').any(1)])
    text           timestamp
400    ‎ 2021-06-19 05:40:54
    text           timestamp
438  NaN 2021-06-10 07:58:06

Cleaning the data¶

Now that I had my data, I needed to clean it up and get it ready for analysis. To help me with this, I again used the NumPy, datetime, and pandas packages.

I also still had use for the stop_date variable, as I'll explain shortly.

In [52]:
# Import requisite packages
# setup global variables
# and read in data
from datetime import datetime
import pandas as pd
import numpy as np
# setup earliest date of interest in this study
stop_date = datetime(2020, 3, 12)
# read in data
data = pd.read_csv("data/data.csv", index_col=0)

First, I converted all the datetime strings from my CSV back to datetime objects with the pandas function to_datetime().

In [53]:
# convert timestamp strings to datetime objects
data['timestamp'] = pd.to_datetime(data['timestamp'], format="%Y-%m-%d %H:%M:%S")

Next, I used a handy regular expression provided by patricksurry[6] at StackOverflow to substitute np.nan for all comments that were empty strings or consisted entirely of whitespace.

In [54]:
# code modified from https://stackoverflow.com/a/21942746
# replace field that's entirely white space (or empty) with NaN
cleaned_data_1 = data.replace(r'^\s*$', np.nan, regex=True).copy()

With the help of a line of code from yulGM[7], again at StackOverflow, I masked out all comments which were only one character long (these do not contain enough meaning for sentiment analysis to be viable).

In [55]:
# code modified from https://stackoverflow.com/a/64837795
# remove all rows where text is 1 character in length
cleaned_data_2 = cleaned_data_1[np.where((cleaned_data_1['text'].str.len() == 1), False, True)].copy()

Next, I removed all comments where the text or the timestamp was "empty" - values which, because of the processes above, were now all flagged as np.nan.

In [56]:
# remove missing data
cleaned_data_3 = cleaned_data_2.dropna().copy()

I was almost there, but I had one cleaning chore left. Because the format of the chess.com forums is not quite as simple as I may have implied above, I likely still had comments from outside my desired time frame. Rather than simply showing the most recent post first and subsequent posts in descending order, chess.com forums show the most recently updated post first, followed by less recently updated posts in descending order. "Updated" here means brand-new posts or posts which have received a new comment. This meant that I had data outside my chosen window of time. For example, if a post from 2019 had received its first new comment in over a year just yesterday, I would have grabbed it, along with all of its old comments, since it was updated within my time frame!

The first step to removing these wayward posts was to sort my dataset by the time of the posts.

In [57]:
# sort data by timestamp
cleaned_data_4 = cleaned_data_3.sort_values('timestamp', axis=0).copy()

Using a similar method to that which I used to remove one character long posts above, I masked out all posts that had a timestamp earlier than my earliest date of interest (this is why I still needed stop_date above).

In [58]:
# remove all data from outside our chosen time window
cleaned_data_5 = cleaned_data_4[np.where((cleaned_data_4['timestamp'] < stop_date), False, True)].copy()
cleaned_data_5.to_csv("data/data_cleaned.csv")

Great! After all this cleaning, let's see how many posts I'm left with.

In [59]:
data_cleaned = pd.read_csv("data/data_cleaned.csv", index_col=0)
print("I now have {} data points!".format(len(data_cleaned)))
I now have 123032 data points!

And let's take a look at my cleaned-up dataset now.

In [62]:
print(data_cleaned[data_cleaned.eq('2021-06-19 05:40:54').any(1)])
print(data_cleaned[data_cleaned.eq('2021-06-10 07:58:06').any(1)])
Empty DataFrame
Columns: [text, timestamp]
Index: []
Empty DataFrame
Columns: [text, timestamp]
Index: []

The problematic posts pointed out above are gone, and the timestamp column is nicely sorted starting early on our first day of interest, as we should expect.

The data was now ready to be prepared for sentiment analysis!

Tokenizing the data¶

To prepare my data for sentiment analysis, I needed to discretize it into 'tokens' - individual chunks of syntactic meaning: typically words, punctuation, symbols, emojis, links, and the like.

To do this, I would, as always, need my good friends datetime, pandas, and NumPy, but also a new package called nltk (the Natural Language Toolkit).
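Before applying this to the whole dataset, here is a minimal, illustrative sketch (the sample sentence is invented, not drawn from my data) of what nltk's word_tokenize() produces:

import nltk
nltk.download('punkt')  # tokenizer models used by nltk.word_tokenize

sample = "She played the Queen's Gambit and won!"
print(nltk.word_tokenize(sample))
# e.g. ['She', 'played', 'the', 'Queen', "'s", 'Gambit', 'and', 'won', '!']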

In [63]:
# Import requisite packages
# and read in data
from datetime import datetime
import pandas as pd
import numpy as np
import nltk
# read in data
# NOTE: you will need to run the cells from the "Cleaning the data" section
# above in order to read in this data.
# If you do not, this cell and the following cells will throw errors
# when you attempt to run them.
clean_data = pd.read_csv("data/data_cleaned.csv", index_col=0)

Using an anonymous function based on Carlos Mougan's code[8], this time from StackExchange, I applied nltk's word_tokenize() function row by row over my dataset, outputting to an array, which I then appended as a new column, "tokens," to my dataset.

In [64]:
# code modified from https://datascience.stackexchange.com/a/68002
tokens = clean_data.apply(lambda row: nltk.word_tokenize(row.iloc[0]), axis=1).copy()
clean_data['tokens'] = tokens.copy()

Let's see our data after this.

In [65]:
clean_data.head()
Out[65]:
text timestamp tokens
175718 How much is average fees for local chess tourn... 2020-03-12 00:03:31 [How, much, is, average, fees, for, local, che...
175726 A good looking chess piece. 2020-03-12 00:25:31 [A, good, looking, chess, piece, .]
175728 Nice! 2020-03-12 00:26:44 [Nice, !]
175729 Which one? 2020-03-12 00:27:14 [Which, one, ?]
175730 Let’s play! 2020-03-12 00:27:28 [Let, ’, s, play, !]

It looks as I expected it to; however, it has two problems. It's redundant and memory-inefficient, since 'text' and 'tokens' are identical in every way except the data structure in which they're held. It's also awkward to have a whole array stored as a cell within another array-like data structure. We'll fix this by storing only the tokens column and flattening each token into its own row.

To accomplish the latter fix, I used the pandas function explode(), which does exactly what I was looking for - giving each token its own row, which takes its index from the index of its originating row. I fixed the former issue by copying only the 'tokens' and 'timestamp' columns into a new DataFrame.
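As a quick illustration of what explode() does, here is a toy example (invented data, not my dataset):

import pandas as pd

# toy frame, purely illustrative
toy = pd.DataFrame({
    "timestamp": ["2020-03-12 00:26:44", "2020-03-12 00:27:14"],
    "tokens": [["Nice", "!"], ["Which", "one", "?"]],
})
# each list element gets its own row, and the originating row's
# index and other columns are repeated alongside it
print(toy.explode("tokens"))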

In [66]:
# make a row for each token with a copy of data from originating row
data_exploded = clean_data.explode('tokens')
# make a new dataframe of our data flattened out
# tokens are indexed by timestamp and sub-indexed by order of occurrence within a post
data_tokenized = pd.DataFrame(data_exploded['tokens'].to_list(),
                              index=[
                                  data_exploded['timestamp'],
                                  [item for sublist in clean_data['tokens'] for item in range(len(sublist))]
                              ],
                              columns=['tokens'])
data_tokenized.to_csv("data/data_tokenized.csv")

I also introduced a second index at this point for ease of human readability. This was a simple index that indicates the 'position' of a token within a comment. This takes us from more common data structures to what pandas calls a multi-index dataframe, i.e., a hierarchical data representation.

Let's see what I mean.

In [67]:
data_tokenized.iloc[46:63]
Out[67]:
tokens
timestamp
2020-03-12 00:27:28 0 Let
1 ’
2 s
3 play
4 !
2020-03-12 00:47:22 0 Мне
1 тоже
2 прислали
3 !
2020-03-12 01:05:32 0 They
1 did
2 not
3 have
4 the
5 resources
6 ,
7 engines

As you can see, there are now groups of rows indexed by their timestamps, where their relative positions within the comments are our secondary index within the groups.

But, again, I see some problems with my data. Punctuation such as periods, while helpful for me to read and understand the pacing and connectivity of sentences, is not particularly helpful for basic sentiment analysis algorithms and introduces noise and unnecessary compute cycles. Also, one of the comments is in Russian. While non-English comments are something another study could include, it is outside this project's scope.

It was now time for me to normalize my data and eradicate these issues.

Normalizing the data¶

Normalizing data in the context of natural language means removing "noisy" tokens (chunks of language which do not hold enough meaning to be useful to a computer) and bringing the remaining words down to their basic forms (running = run, ladies = lady, etc.).

To accomplish this, I was going to need some help from nltk. Along with nltk, I was still going to need NumPy, pandas, and datetime, as well as re and string.
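To show what normalization does on a couple of hand-picked example words (an illustrative sketch only, separate from the pipeline below):

import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("running", pos="v"))  # -> 'run'
print(lemmatizer.lemmatize("ladies", pos="n"))   # -> 'lady'

# stop words are common, low-information words that get dropped entirely
sample_tokens = ["the", "queen", "is", "winning"]
print([t for t in sample_tokens if t not in stopwords.words("english")])  # -> ['queen', 'winning']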

In [68]:
# Import requisite packages
# setup global variables
# and read in data
from datetime import datetime
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
import re, string
from nltk.corpus import stopwords
data_to_normalize = pd.read_csv("data/data_tokenized.csv", index_col=[0, 1])
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zacbolton/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/zacbolton/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/numpy/lib/arraysetops.py:583: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  mask |= (ar1 == a)

With a lot of help from this article by Shaumik Daityari[9], I wrote up a function that could normalize a post from my data.

It uses re to match and remove all tokens which are links or @-handles (such as those found in email addresses). It also removes all tokens which match nltk's English stop words corpus or string's punctuation constant. Finally, it lemmatizes (brings words down to their basic forms) the remaining tokens using nltk's WordNetLemmatizer, guided by the part-of-speech tags from pos_tag().

In [69]:
# code modified from https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk
def remove_noise(tokens, stop_words = ()):

    cleaned_tokens = []

    # pos_tag does not work on every tokenized string
    # '\' seems to cause it to fail.
    # if pos_tag fails, return a flag so the calling routine
    # can react accordingly
    try:
        tags = pos_tag(tokens)
    except:
        return False

    for token, tag in tags:
        # remove links and @-handles
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        # stem tokens
        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        # remove if empty, punctuation, or stop-word
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

Now I just needed to run each post in my dataset through remove_noise(). The loop below does just this and unsurprisingly takes a few minutes to complete.

In [70]:
# create containers for columns and indices
time_index = []
pos_index = []
normalized_tokens = []

# loop through data by first hierarchical index (post)
# remove noise from tokens, and add to containers
for time, tokens in data_to_normalize.groupby(level=0):
    tokens = remove_noise(tokens['tokens'], stopwords.words('english'))
    if tokens is False:
        pass
    else:
        time_index.extend([time for x in range(len(tokens))])
        pos_index.extend([x for x in range(len(tokens))])
        normalized_tokens.extend(tokens)

data_normalized = pd.DataFrame({'tokens': normalized_tokens}, index=[time_index, pos_index])

data_normalized.to_csv("data/data_normalized.csv")

As usual, let's take a look at our data before moving on.

In [71]:
data_normalized.iloc[24:37]
Out[71]:
tokens
2020-03-12 00:27:28 0 let
1 ’
2 play
2020-03-12 00:47:22 0 мне
1 тоже
2 прислали
2020-03-12 01:05:32 0 resource
1 engine
2 internet
3 study
4 use
5 write
6 ....

As we can see, our data retains our desired structure but has been changed quite a lot. It is now much more difficult for us to understand; however, it is much easier for a computer to read with regard to sentiment.

The Russian post is not gone yet; however, the next step will remove it, since that step pulls out posts based on female pronouns and other words for women in English and no other language.

Finding female words¶

Now I needed to prune my dataset down to only those posts which talk about women. To do this, I would still need pandas and NumPy as usual. I also would need my remove_noise() function from above along with its nltk dependencies.

In [72]:
# Import requisite packages
# setup global variables
# and read in data
import pandas as pd
import numpy as np
import nltk
from nltk.tag import pos_tag
import re, string
from nltk.stem.wordnet import WordNetLemmatizer
data_normalized = pd.read_csv("data/data_normalized.csv", index_col=[0, 1])
/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages/numpy/lib/arraysetops.py:583: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  mask |= (ar1 == a)

Now that I had my necessary packages, I needed to define what a 'female word' was in the context of this study. Below are the words I chose. It is worth noting that the first four "words" listed are abbreviations for the chess titles Woman Grandmaster, Woman FIDE Master, Woman International Master, and Woman Candidate Master. The code below takes my collection of words, runs it through the remove_noise() function to align it with the dataset, and then feeds the result into a set() to remove duplicates.

In [73]:
# Female verbiage to filter data by
female_words = set(
    remove_noise(
        [
            'wgm',
            'wfm',
            'wim',
            'wcm',
            'her',
            'hers',
            'she',
            'girl',
            'woman',
            'women',
            'girls',
            'female',
            'females',
            'lady',
            'ladies',
            'wife',
            'sister',
            'mother',
            'daughter',
            'niece',
            'girlfriend',
            "women's",
            "woman's",
            'gal',
            'dame',
            'lass',
            'chick'
        ]
    )
)

Now it was time to loop through my posts and pull out all those posts which contain at least one of the words I had identified above.

In [74]:
# create containers for columns and indices
time_index = []
pos_index = []
normalized_tokens = []

# again, loop through data by first hierarchical index (post)
# if it is disjoint from our set of female verbiage
# do nothing, otherwise save it to containers
for time, tokens in data_normalized.groupby(level=0):
    words = tokens.values.flat
    if (not set(words).isdisjoint(female_words)):
        # if the post contains at least one of the words in female_words, keep it
        times = [time for x in range(len(tokens))]
        time_index.extend(times)

        positions = [x for x in range(len(tokens))]
        pos_index.extend(positions)

        normalized_tokens.extend(tokens.values.flat)

posts_containing_female_words = pd.DataFrame({'tokens': normalized_tokens}, index=[time_index, pos_index])

posts_containing_female_words.to_csv("data/posts_containing_female_words.csv")

Let's again take a look at our data and see if everything looks right.

In [75]:
posts_containing_female_words.iloc[0:37]
Out[75]:
tokens
2020-03-14 04:08:19 0 take
1 mine
2 local
3 bar
4 use
5 pick
6 chick
7 know
8 excited
9 girl
10 get
11 find
12 play
13 chess
14 ...
2020-03-19 17:01:18 0 fine
1 start
2 look
3 game
4 one
5 played
6 person
7 also
8 1000s
9 make
10 75
11 best
12 move
13 possible
14 accidently
15 play
16 daughter
17 new
18 account
19 instantly
20 get
21 bump

Wonderful! Our first two posts are indeed mentioning women.

Our data is now ready for the main course - sentiment analysis.

Performing Sentiment Analysis¶

For Sentiment Analysis, pandas and NumPy were going nowhere, but I was going to need nltk's VADER[10] algorithm.

In [76]:
# Import requisite packages
# setup global variables
# and read in data
import pandas as pd
import numpy as np
from nltk.sentiment import SentimentIntensityAnalyzer
posts_containing_female_words = pd.read_csv("data/posts_containing_female_words.csv", index_col=[0, 1])

Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.

VADER is a sentiment analysis algorithm trained on social media posts, built into nltk's core package. VADER outputs four scores: positive, negative, and neutral scores, each between zero and one, and a compound score between negative one and positive one, which is an aggregate score defined internally within the SentimentIntensityAnalyzer module (it is not a simple average, however). I needed to loop through my posts, run sentiment analysis on each, and save the returned scores along with the timestamp of the post, storing them in four different DataFrames at the end. This would give me four time series for analysis: positive, negative, neutral, and compound.
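To show what these scores look like for a single piece of text, here is a minimal sketch (the sentence is an invented example, not from my data):

import nltk
nltk.download('vader_lexicon')  # lexicon used by the analyzer
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
# returns a dict with 'neg', 'neu', and 'pos' scores plus the aggregate 'compound'
print(sia.polarity_scores("She played a brilliant game!"))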

I should also mention that I chose to save words in two additional containers below to generate word clouds of tokens found before the airing of The Queen's Gambit and afterward.

Some of the code below is derived from the work of Marius Mogyorosi in their article Sentiment Analysis: First Steps With Python's NLTK Library[11].

In [77]:
# Code modified from https://realpython.com/python-nltk-sentiment-analysis/#using-nltks-pre-trained-sentiment-analyzer
# instantiate VADER
sia = SentimentIntensityAnalyzer()

# setup containers for each series
pos_scores = []
neg_scores = []
neu_scores = []
compound_scores = []
times = []
# save tokens in two different containers
# based on whether they were found in posts
# before or after the airing of the show
# for wordcloud generation
before_tokens = []
after_tokens = []

# again, loop through data by first hierarchical index (post)
# convert tokens back to a string, feed through VADER,
# and save results in containers
for time, tokens in posts_containing_female_words.groupby(level=0):
    if time < '2020-10-23':
        before_tokens.extend(tokens.values.flat)
    else:
        after_tokens.extend(tokens.values.flat)
    post_stringified = " ".join(tokens.values.flat)
    scores = sia.polarity_scores(post_stringified)
    pos_scores.append(scores['pos'])
    neg_scores.append(scores['neg'])
    neu_scores.append(scores['neu'])
    compound_scores.append(scores['compound'])
    times.append(time)

# convert containers to pandas DataFrames and save as CSVs
positive_series = pd.DataFrame(pos_scores, index=times, columns=['scores'])
positive_series.index.name = 'times'
positive_series.to_csv("data/series/positive_series.csv")

negative_series = pd.DataFrame(neg_scores, index=times, columns=['scores'])
negative_series.index.name = 'times'
negative_series.to_csv("data/series/negative_series.csv")

neutral_series = pd.DataFrame(neu_scores, index=times, columns=['scores'])
neutral_series.index.name = 'times'
neutral_series.to_csv("data/series/neutral_series.csv")

compound_series = pd.DataFrame(compound_scores, index=times, columns=['scores'])
compound_series.index.name = 'times'
compound_series.to_csv("data/series/compound_series.csv")

# save tokens for word clouds
before_tokens = pd.Series(before_tokens)
before_tokens.to_csv("data/wordclouds/before.csv")

after_tokens = pd.Series(after_tokens)
after_tokens.to_csv("data/wordclouds/after.csv")

Let's see what these series look like.

In [78]:
positive_series.head()
Out[78]:
scores
times
2020-03-14 04:08:19 0.270
2020-03-19 17:01:18 0.330
2020-03-21 17:47:59 0.135
2020-03-22 16:33:24 0.359
2020-03-23 14:34:20 0.511
In [79]:
negative_series.head()
Out[79]:
scores
times
2020-03-14 04:08:19 0.000
2020-03-19 17:01:18 0.136
2020-03-21 17:47:59 0.074
2020-03-22 16:33:24 0.000
2020-03-23 14:34:20 0.000
In [80]:
neutral_series.head()
Out[80]:
scores
times
2020-03-14 04:08:19 0.730
2020-03-19 17:01:18 0.534
2020-03-21 17:47:59 0.791
2020-03-22 16:33:24 0.641
2020-03-23 14:34:20 0.489
In [81]:
compound_series.head()
Out[81]:
scores
times
2020-03-14 04:08:19 0.5859
2020-03-19 17:01:18 0.9559
2020-03-21 17:47:59 0.3182
2020-03-22 16:33:24 0.9423
2020-03-23 14:34:20 0.8555

Everything looks in order. But before plotting and analyzing the data, I needed to consider outliers.

Removing outliers¶

Any score that is dramatically different from the mean could skew our data. I needed to take a look at the overall distribution of each series. But first, I needed my usual packages and the data.

In [82]:
# Import requisite packages
# setup global variables
# and read in data
import pandas as pd
import numpy as np
positive_series = pd.read_csv("data/series/positive_series.csv", index_col=0)
negative_series = pd.read_csv("data/series/negative_series.csv", index_col=0)
neutral_series = pd.read_csv("data/series/neutral_series.csv", index_col=0)
compound_series = pd.read_csv("data/series/compound_series.csv", index_col=0)

Now to see what our data looks like.

In [83]:
positive_series.describe()
Out[83]:
scores
count 649.000000
mean 0.203746
std 0.185354
min 0.000000
25% 0.000000
50% 0.188000
75% 0.327000
max 0.828000
In [84]:
negative_series.describe()
Out[84]:
scores
count 649.000000
mean 0.076413
std 0.133300
min 0.000000
25% 0.000000
50% 0.000000
75% 0.114000
max 0.863000
In [85]:
neutral_series.describe()
Out[85]:
scores
count 649.000000
mean 0.719843
std 0.200474
min 0.137000
25% 0.578000
50% 0.717000
75% 0.882000
max 1.000000
In [86]:
compound_series.describe()
Out[86]:
scores
count 649.000000
mean 0.308104
std 0.471941
min -0.950100
25% 0.000000
50% 0.361200
75% 0.726900
max 0.999400

Of interest here are the mean, std, and min/max values. For example, in the positive series, the max is 0.828000, while the mean and standard deviation are 0.203746 and 0.185354.

In [87]:
# get the max of our positive series
psmax = positive_series.max()
# get the mean of our positive series
psmean = positive_series.mean()
# get the standard deviation of our positive series
psstd = positive_series.std()
print(
    "Our maximum positive score is {:.1f} standard deviations from the mean!".format(
        (
            psmax[0] - psmean[0]
        ) / psstd[0]
    )
)
Our maximum positive score is 3.4 standard deviations from the mean!

As Jason Brownlee explains in his article How to Remove Outliers for Machine Learning[12]:

A value that falls outside of 3 standard deviations is part of the distribution, but it is an unlikely or rare event at approximately 1 in 370 samples.

Three standard deviations from the mean is a common cut-off in practice for identifying outliers in a Gaussian or Gaussian-like distribution. For smaller samples of data, perhaps a value of 2 standard deviations (95%) can be used, and for larger samples, perhaps a value of 4 standard deviations (99.9%) can be used.

As my datasets are rather small (649 data points each [see above]), I chose my cutoff to be 2 standard deviations from the mean.
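To make the cutoff concrete for the positive series, here is a quick check using the numbers from describe() above (a sketch only; the actual trimming happens in trim_series() below):

# quick arithmetic check of the cutoff, using the positive series'
# statistics from describe() above
mean, std = 0.203746, 0.185354

upper_cutoff = mean + 2 * std   # ~0.574
lower_cutoff = mean - 2 * std   # ~-0.167 (positive scores can't go below zero,
                                #  so only the upper cutoff actually trims anything)
print(upper_cutoff, lower_cutoff)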

Now I needed a function to remove outliers based on this cutoff. I did so with trim_series() below.

In [88]:
# function to remove posts with a sentiment
# score that is over 2 standard deviations
# from the mean
def trim_series(series, title):
    std = series.std()[0]
    mean = series.mean()[0]
    upper_cutoff = mean + std * 2.0
    lower_cutoff = mean - std * 2.0
    series2 = series[series['scores'] < upper_cutoff].copy()
    series3 = series2[series2['scores'] > lower_cutoff].copy()
    series3.to_csv("data/series/{}_series_trimmed.csv".format(title))

trim_series(positive_series, "positive")
trim_series(negative_series, "negative")
trim_series(neutral_series, "neutral")
trim_series(compound_series, "compound")

Let's see how our data has changed.

In [89]:
positive_series_trimmed = pd.read_csv("data/series/positive_series_trimmed.csv", index_col=0)
negative_series_trimmed = pd.read_csv("data/series/negative_series_trimmed.csv", index_col=0)
neutral_series_trimmed = pd.read_csv("data/series/neutral_series_trimmed.csv", index_col=0)
compound_series_trimmed = pd.read_csv("data/series/compound_series_trimmed.csv", index_col=0)

pstmax = positive_series_trimmed.max()
pstmean = positive_series_trimmed.mean()
pststd = positive_series_trimmed.std()
print(
    "Our maximum positive score is now only {:.1f} standard deviations from the mean!".format(
        (
            pstmax[0] - pstmean[0]
        ) / pststd[0]
    )
)
Our maximum positive score is now only 2.3 standard deviations from the mean!

This might seem odd, as you would expect the maximum positive score to now be at most 2.0 standard deviations from the mean. However, the trimming process changed the distribution itself: removing the extreme values shrinks the standard deviation, so the remaining maximum can still sit more than two of the new, smaller standard deviations from the new mean. We have nonetheless successfully removed the outliers based on the original distribution, according to our cutoff.

Now it's time to really see what all this looks like.

Plotting the data¶

Using a word cloud¶

Before displaying my data in a more measured and comprehensive way, I wanted to get a bird's eye view of it. So I decided to show both halves of my data, before and after the release of The Queen's Gambit, as two different word clouds. To do this, I would need pandas, along with two new packages, matplotlib's pyplot, and wordcloud.

In [90]:
# Import requisite packages
# setup global variables
# and read in data
import pandas as pd
from matplotlib import pyplot
from wordcloud import WordCloud
before_tokens = pd.read_csv("data/wordclouds/before.csv", index_col=0)
after_tokens = pd.read_csv("data/wordclouds/after.csv", index_col=0)

Now I just needed to define a function that could receive my collections of words, assign them to a WordCloud instance, and set a few matplotlib dials for display.

In [91]:
def generate_word_cloud(tokens, title):
    # container string to be input to WordCloud
    words_in = ""
    # loop through tokens and extend words_in string
    for token in tokens.values:
        words_in += token[0] + " "
    # instantiate WordCloud with input string and display arguments set
    wordcloud = WordCloud(width = 1300, height = 600,
                    background_color ='white',
                    min_font_size = 12).generate(words_in)
    # adjust and display WordCloud with pyplot
    pyplot.figure(figsize = (13, 6), facecolor = None)
    pyplot.imshow(wordcloud)
    pyplot.axis("off")
    pyplot.tight_layout(pad = 0)
    pyplot.title(title)
    # save PNG
    pyplot.savefig("figures/wordcloud_{}.png".format(title))

generate_word_cloud(before_tokens, "BEFORE")
generate_word_cloud(after_tokens, "AFTER")
Word cloud of tokens from posts BEFORE the show's debut
Word cloud of tokens from posts AFTER the show's debut

I noticed a few interesting differences in the clouds. The words 'title' and 'gm' increase significantly after the release of the show. Titles are coveted and held in high esteem in the chess community, with Grandmaster (GM for short) being the top title. Also, 'girl' increases significantly. The lead of The Queen's Gambit is an adolescent. I also find it interesting that the word 'gender' makes an appearance only after the first airing. Finally, of note is that 'woman' grows significantly after the show's debut. This is not surprising, and it might indicate that the community was talking more inclusively about women and taking their contribution to the game more seriously. However, with such a loose representation of my data, this is just speculation.

I decided it was time to show my data more robustly and measurably.

Plotting sentiment vs. time¶

For more detail-rich data visualization, I would need the same packages as above, as well as NumPy and datetime.

In [92]:
# Import requisite packages
# setup global variables
# and read in data
from datetime import datetime
import pandas as pd
import numpy as np
from matplotlib import pyplot
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
positive_series_trimmed = pd.read_csv("data/series/positive_series_trimmed.csv", index_col=0)
negative_series_trimmed = pd.read_csv("data/series/negative_series_trimmed.csv", index_col=0)
neutral_series_trimmed = pd.read_csv("data/series/neutral_series_trimmed.csv", index_col=0)
compound_series_trimmed = pd.read_csv("data/series/compound_series_trimmed.csv", index_col=0)

With some help from Dodge[13] on StackOverflow, I wrote the plot() routine below. plot() does a few things: it displays all my data points on a time vs. sentiment score scatterplot as red dots, highlights the date of the first airing of The Queen's Gambit with a vertical purple line, and overlays a polynomial line of best fit (in blue) of the degree provided as an input. My chosen degree was eight, as lower degrees seemed to lack information and higher degrees started to get too noisy and overfit my data.

In [93]:
def plot(series, title, poly_degree, y_range=None):
    # code retrieved from https://stackoverflow.com/a/53474181
    series.index = pd.to_datetime(series.index, format="%Y-%m-%d %H:%M:%S")

    # score magnitudes on the Y axis
    y_values = series.loc[:, "scores"]
    # date and time of post on the X axis
    x_values = np.linspace(0,1,len(series.loc[:, "scores"]))

    # run NumPy's built in least squares polynomial fit algorithm
    # with the polynomial degree determined by input, poly_degree
    coeffs = np.polyfit(x_values, y_values, poly_degree)
    poly_eqn = np.poly1d(coeffs)
    y_hat = poly_eqn(x_values)

    # instantiate our plot
    plot_out = series.plot(style='k.')
    # provide values to plot
    plot_out.plot(series.index, series.loc[:,"scores"], "ro")
    plot_out.plot(series.index,y_hat)
    # assign plot title
    pyplot.title(title)
    # highlight date of airing of The Queen's Gambit
    pyplot.axvline(x=datetime.strptime('2020-10-23', "%Y-%m-%d"), color="purple")
    # enlarge plots for easy viewing
    pyplotfig = pyplot.gcf()
    pyplotfig.set_size_inches(18.5, 8)
    # assign axis labels
    pyplot.ylabel('SCORE')
    pyplot.xlabel('TIME')
    # if a y-range was provided, apply it;
    # otherwise let matplotlib choose
    if y_range is not None:
        pyplot.ylim(y_range)
    # save plots as PNGs
    pyplot.savefig("figures/{}.png".format(title))

poly_degree = 8

plot(positive_series_trimmed, 'POSITIVE', poly_degree, (0,1))
plot(negative_series_trimmed, 'NEGATIVE', poly_degree, (0,1))
plot(neutral_series_trimmed, 'NEUTRAL', poly_degree, (0,1))
plot(compound_series_trimmed, 'COMPOUND', poly_degree, (-1,1))
Scatterplots of sentiment score vs. time for the POSITIVE, NEGATIVE, NEUTRAL, and COMPOUND series, each with its polynomial line of best fit and the show's debut marked in purple

Conclusion¶

Interpreting the data¶

  • While the positive sentiment series does see an uptick around the time of The Queen's Gambit debut, it is not significantly different from other shifts seen as you move away from the date in either direction.
  • The negative series is very flat and does not show any shift around the time of the show's release.
  • The neutral series is very flat around the time in question, except for an almost imperceptible depression which is not severe enough to indicate any correlation.
  • The compound series does show a sustained positive delta at the debut, but like with the positive series, it is not different enough from other shifts to clearly indicate a trend.

Taken together, it is unreasonable to say that this data shows any correlation between community sentiment and the advent of the hit TV show.

While, as noted above, there is a slight increase in positive sentiment around the show's debut (as evidenced by the lines of best fit in the positive and compound series), it is not pronounced enough to rule out its simply being noise. Therefore, my conclusion is that the show did not engender a significant shift in the community's attitude toward women (at least not among chess.com's community).

It is interesting, however, that the positive score is consistently higher than the negative score. You can see this by visually comparing the plots for the positive and negative series, and by noticing that the line of best fit on the compound series never dips below zero, staying mostly between the 0.25 and 0.50 marks. This is at odds with my starting assumptions and worth investigating further.

I would be very curious if a study done over a longer time frame or a greater variety of sources would reveal a greater positive trend than what I uncovered above. My study was certainly not exhaustive, and there is much one could do to improve on it and take it further.

Problems with my data and methodology¶

As you might have already noticed, there are some problems with my data and my method of processing the data. Here is a list of the main ones:

  • I treat chess.com's General forum as a representative sample of the chess community at large, which is flawed to an arguable degree
  • VADER is trained on general social media content and therefore might not perform as accurately on more niche and technical social media such as chess.com's forums
  • Chess.com posts are filtered by community moderators and bots, meaning that my data is a curated subset of all posts based on the site's terms of service and community guidelines
  • My window of time could be too small at 16 months
  • As a niche and technical sport, chess forums, in general, are prone to being overwhelmingly technical and dry, giving rise to high neutrality scores in sentiment analysis which could obfuscate sentiment trends
  • My process of removing outliers was rather brute force and would likely benefit from refinement
  • Sexist remarks could be scored by algorithms as positive but, in fact, be negative, e.g., "I bet she's great at making sandwiches."

Ways to extend and improve this study¶

Related to the issues above, if one wanted to improve this study or expand upon it, there are many ways they could do so. Here are some of them:

  • Gather additional data from sources other than chess.com, including but not limited to r/chess, lichess.org forums, and comments sections on chessbase.com articles
  • Use alternative sentiment analysis algorithms (or build and train one's own) such as Google Cloud NLP, IBM's Watson API, or the SpaCy python library
  • Increase the window of time for data-collection
  • Use outlier removal to deal with neutral scores as well - for example, remove all positive scores lower than 0.01
  • Approach outlier removal in a more context-dependent manner, using advanced natural language processing to determine if a numerical outlier is truly aberrant and if numerically typical data points are actually outliers - perhaps with a trained deep neural network which is more holistic in its approach than either mean and standard deviation analysis or sentiment analysis
  • Use principal component analysis (PCA) to identify gender biases in the dataset which might not be caught by sentiment analysis
  • Use metadata to add more interpretable context to each datapoint, like adding a "number of replies" field, which could then be used as a weighting in determining how heavily we consider its final sentiment score

On the ethics of my data collection from chess.com¶

Legally speaking, I would argue that I am, at least at the moment[14], within my rights to scrape public data, due to precedents such as the hiQ vs. LinkedIn[15] case, which landed on the side of such practices being permitted. However, ethical and legal are not synonyms, so I will discuss two practices I put in place to ensure my methods were ethical, at least as I perceive prevailing social standards.

  1. First, as already mentioned above, I implemented a two-second delay between each call to chess.com's servers from my web scraper

    This ensured that I was not overburdening chess.com's servers and adversely affecting their performance for the company or its customers.

  2. Second, I did not store any more data than I needed for the study, and nothing that could be used to personally identify a poster (unless that poster had included such details in their post directly)

    This meant that if I were to share my study publicly (which I do not plan on doing), a recipient would not have easy access to posters' chess.com handles or any other sort of sensitive data, hopefully mitigating any harassment, or worse, that could stem from publicizing my data.

Also, importantly, chess.com's robots.txt file does not disallow /forum or any of its child pages.

While I will not argue that this covers every conceivable possibility for abuse, I will argue that it is sufficient for a private scholarly study and lies within common social standards of what is ethical.

Similar studies¶

There are a couple of studies similar to what I've done here which I should mention:

  • Alina Gorbatch's study, The Queen's Gambit: social media study[16]
    • An excellent piece on sentiment and The Queen's Gambit. It is broader in sample diversity than my study, since it analyzes sentiment in social media at large rather than the chess community more narrowly. On the other hand, it is narrower in chronological scope than my study, covering only the 30 days surrounding the release of the TV show.
  • Emily Hall's study, A Chess Revolution: Analysing Reactions to the Queen's Gambit[17]
    • An extensive and thorough analysis of the reaction to the series. While it does examine chess spaces in isolation as well as the wider social media sphere, it does not ask whether attitudes towards gender within the chess space shifted as a result of the show.

Bibliography¶

[1] H. Petersen, ‘Nigel Short says men ‘hardwired’ to be better chess players than women’, 2015. [Online]. Available: https://www.theguardian.com/world/2015/apr/20/nigel-short-uk-grandmaster-men-hardwired-better-chess-players-women. [Accessed: 4- Jun- 2021].

[2] A. Horton, ‘How The Queen’s Gambit became Netflix’s unlikeliest hit of the year’, 2020. [Online]. Available: https://www.theguardian.com/tv-and-radio/2020/nov/26/the-queens-gambit-netflix-most-watched-series-hit-chess. [Accessed: 4- Jun- 2021].

[3] Chess.com, ‘Chess.com General Forum’. [Online]. Available: https://www.chess.com/forum/category/general [Accessed: 5- Jun- 2021].

[4] Google.com, ‘Google Trends’. [Online]. Available: https://trends.google.com/trends/explore?date=2020-03-01 2021-06-15&q=queens gambit [Accessed: 4- Jun- 2021].

[5] S. McGrath, ‘5.05 Introduction to web scraping’, 2020. [Online]. Available: https://www.coursera.org/learn/uol-cm2015-programming-with-data/lecture/kWE1l/5-05-introduction-to-web-scraping. [Accessed: 10- May- 2021].

[6] patricksurry, 2021. [StackOverflow Answer]. Available: https://stackoverflow.com/a/21942746.

[7] yulGM, 2020. [StackOverflow Answer]. Available: https://stackoverflow.com/a/64837795.

[8] C. Mougan, 2020. [StackExchange Answer]. Available: https://datascience.stackexchange.com/a/68002.

[9] S. Daityari, ‘How To Perform Sentiment Analysis in Python 3 Using the Natural Language Toolkit (NLTK)’, 2019. [Online]. Available: https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk. [Accessed: 11- Jun- 2021].

[10] C. Hutto & E Gilbert, ‘VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text’, 2014. [Online]. Available: https://www.nltk.org/api/nltk.sentiment.html?highlight=vader#module-nltk.sentiment.vader. [Accessed: 17- Jun- 2021].

[11] M. Mogyorosi, ‘Sentiment Analysis: First Steps With Python’s NLTK Library’, 2021. [Online]. Available: https://realpython.com/python-nltk-sentiment-analysis/#using-nltks-pre-trained-sentiment-analyzer. [Accessed: 11- Jun- 2021].

[12] J. Brownlee, ‘How to Remove Outliers for Machine Learning’, 2018. [Online]. Available: https://machinelearningmastery.com/how-to-use-statistics-to-identify-outliers-in-data/. [Accessed: 17- Jun- 2021].

[13] Dodge, 2018. [StackOverflow Answer]. Available: https://stackoverflow.com/a/53474181.

[14] T. Claburn, ‘US Supreme Court gives LinkedIn another shot at stymieing web scraping’, 2021. [Online]. Available: https://www.theregister.com/2021/06/15/linkedin_supreme_court_scraping/. [Accessed: 17- Jun- 2021].

[15] B. Katris & R. Schaul, ‘Data Scraping Survives! (At Least for Now) Key Takeaways from 9th Circuit Ruling on the HIQ vs. Linkedin Case’, 2019. [Online]. Available: https://www.natlawreview.com/article/data-scraping-survives-least-now-key-takeaways-9th-circuit-ruling-hiq-vs-linkedin. [Accessed: 17- Jun- 2021].

[16] A. Gorbatch, ‘The Queen’s Gambit: social media study’, 2021. [Online]. Available: https://awario.com/blog/the-queen-s-gambit-social-media-study/. [Accessed: 17- Jun- 2021].

[17] E. Hall, ‘A Chess Revolution: Analysing Reactions to the Queen’s Gambit’, 2020. [Online]. Available: https://relativeinsight.com/a-chess-revolution-analysing-reactions-to-the-queens-gambit/. [Accessed: 17- Jun- 2021].