import os
import requests
import shutil
import zipfile

data_dir = './WCA_data'
data_zip = f'{data_dir}.zip'

if os.path.isdir(data_dir):
    # Delete the existing data
    shutil.rmtree(data_dir)

if os.path.exists(data_zip):
    # Delete the zipped data
    os.remove(data_zip)

# Get data from WCA
url = 'https://www.worldcubeassociation.org/results/misc/WCA_export.tsv.zip'
r = requests.get(url, stream=True)

with open(data_zip, 'wb') as f:
    f.write(r.content)
            
# Extract zipped data
with zipfile.ZipFile(data_zip, 'r') as zf:
    zf.extractall(data_dir)


import numpy as np
import pandas as pd

df_avg = pd.read_csv(f'{data_dir}/WCA_export_RanksAverage.tsv', sep='\t', low_memory=False)
df_avg.head(5)


# Drop all events that we aren't interested in
df_avg.drop(df_avg[(df_avg.eventId < '444') | (df_avg.eventId > '777')].index, inplace=True)

# Only keep competitors with averages in all events we're interested in (4x4 through 7x7)
tmp = pd.merge(df_avg, df_avg, on='personId', suffixes=('_4', '_5'))
tmp.drop(tmp[(tmp.eventId_4 != '444') | (tmp.eventId_5 != '555')].index, inplace=True)
tmp = pd.merge(tmp, df_avg, on='personId')
tmp.drop(tmp[(tmp.eventId_4 != '444') | (tmp.eventId_5 != '555') | 
             (tmp.eventId != '666')].index, inplace=True)
tmp = pd.merge(tmp, df_avg, on='personId', suffixes=('_6', '_7'))
df_grouped = tmp.drop(tmp[(tmp.eventId_4 != '444') | (tmp.eventId_5 != '555') | 
                          (tmp.eventId_6 != '666') | (tmp.eventId_7 != '777')].index)
df_grouped.reset_index(drop=True, inplace=True)

# Drop columns that won't be used going forward
cols = ['eventId', 'continentRank', 'countryRank']
df_grouped.drop([f'{col}_{n}' for col in cols for n in range(4, 8)], 
                axis = 1, inplace=True)

# Fix time scale
for n in range(4, 8):
    df_grouped[f'best_{n}'] = np.float64(df_grouped[f'best_{n}'] / 100.0)

df_grouped


import matplotlib.pyplot as plt

fig, ax = plt.subplots()
plt.rcParams["figure.figsize"] = (12,7.5) # Create plot and size it
events = ['4x4', '5x5', '6x6', '7x7'] # Names for big cube events

# Create array with time data for each event
time_array = pd.DataFrame().assign(four = df_grouped['best_4'],
                                  five = df_grouped['best_5'],
                                  six = df_grouped['best_6'],
                                  seven = df_grouped['best_7'])

# Make and label violin plot
ax.violinplot(time_array,range(0,4,1),showmeans=True)
ax.set_xticks(np.arange(0,4,1), events)
ax.set_xlabel("Event")
ax.set_ylabel("Time To Solve (seconds)")
ax.set_title("Distribution by Event")

Text(0.5, 1.0, 'Distribution by Event')


def create_scatter(df):
    # Randomized rows to look at sample if desired
    df_shuffled = df.sample(frac=1).reset_index(drop=True)

    # Create average 4x4 & 5x5 ranks and average 6x6 & 7x7 ranks
    combined_45 = (df_shuffled['worldRank_4'] + df_shuffled['worldRank_5']) / 2
    combined_67 = (df_shuffled['worldRank_6'] + df_shuffled['worldRank_7']) / 2

    plt.rcParams["figure.figsize"] = (12,7.5)

    # Plot Average 4x4 & 5x5 Rank vs Average 6x6 & 7x7 Rank
    sample_size = len(combined_45)  # Insert desired sample size to observe
    x = combined_45[:sample_size]
    y = combined_67[:sample_size]
    plt.scatter(x, y)
    plt.xlim(max(x), 0)
    plt.ylim(max(y), 0)
    plt.plot([max(x), 0], [max(x), 0], color='r') # plots line y = x
    plt.xlabel('Average 4x4 & 5x5 Rank')
    plt.ylabel('Average 6x6 & 7x7 Rank')
    plt.title('Average 4x4 & 5x5 Rank vs Average 6x6 & 7x7 Rank')

create_scatter(df_grouped)
plt.show()


df_ranked = df_grouped.copy()

def contained_rank(n):
    # Reassign rankings for our dataset for NxN
    df_ranked.sort_values(by=f'worldRank_{n}', inplace=True)
    df_ranked.reset_index(drop=True, inplace=True)
    for i, row in df_ranked.iterrows():
        df_ranked.loc[i, f'worldRank_{n}'] = i + 1

for n in range(7, 3, -1):
    contained_rank(n)

df_ranked


def mark_id(wca_id, name, color, size):
    # Marking competitor in the plot with a specified color and size
    df = df_ranked.loc[df_ranked['personId'] == wca_id]
    combined_45 = (df['worldRank_4'] + df['worldRank_5']) / 2
    combined_67 = (df['worldRank_6'] + df['worldRank_7']) / 2
    plt.plot(combined_45, combined_67, f'{color}o', ms=size)
    plt.text(combined_45, combined_67, name)

create_scatter(df_ranked)
mark_id('2013PARK03', 'Andrew Park', 'r', 10)
plt.show()


from sklearn.model_selection import train_test_split  
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error


df_shuffled = df_ranked.sample(frac=1).reset_index(drop=True)
# Couple our data into 2 groups for prediction scheme
# Create group A using 4x4 & 5x5 ranks
groupA_X = pd.DataFrame().assign(four=df_shuffled['worldRank_4'], five=df_shuffled['worldRank_5'])
# Create group B using 6x6 & 7x7 ranks
groupB_X = pd.DataFrame().assign(six=df_shuffled['worldRank_6'], seven=df_shuffled['worldRank_7'])

prediction_percent_error = []  # Array used to visualize percent error for each prediction

for rank in ['worldRank_6', 'worldRank_7']: # Use group A to predict through linear regression 6x6 & 7x7 results
    X_train, X_test, y_train, y_test = train_test_split(groupA_X, df_shuffled[rank], test_size=0.4, random_state=0)
    reg = LinearRegression().fit(X_train, y_train)
    prediction_percent_error.append(mean_absolute_percentage_error(y_test, reg.predict(X_test)))
    
for rank in ['worldRank_4', 'worldRank_5']: # Use group B to predict through linear regression 4x4 & 5x5 results
    X_train, X_test, y_train, y_test = train_test_split(groupB_X, df_shuffled[rank], test_size=0.4, random_state=0)
    reg = LinearRegression().fit(X_train, y_train)
    prediction_percent_error.append(mean_absolute_percentage_error(y_test, reg.predict(X_test)))

names = ['4x4 Prediction', '5x5 Prediction', '6x6 Prediction', '7x7 Prediction'] # Names for plot
                      
#Create plot and adjust size
fig = plt.figure(figsize =(10, 7))
 
# Horizontal Bar Plot
plt.bar(names, prediction_percent_error)
plt.xlabel('Predictions')
plt.ylabel('Percent Error')
plt.title('Percent error for each set of predictions')
 
# Show Plot
plt.show()

	personId	eventId	best	worldRank	continentRank	countryRank
0	2018KHAN28	222	102	1	1	1
1	2013EGDA02	222	121	2	1	1
2	2012CALL01	222	123	3	2	2
3	2016LIJI05	222	125	4	1	1
4	2012PATE01	222	127	5	2	1

	personId	best_4	worldRank_4	best_5	worldRank_5	best_6	worldRank_6	best_7	worldRank_7
0	2012PARK03	19.88	1	38.45	1	75.63	1	106.57	1
1	2010WEYE02	21.46	2	45.80	8	87.19	8	154.27	79
2	2009ZEMD01	21.57	3	42.09	3	81.90	5	120.63	4
3	2016KOLA02	22.00	4	39.79	2	80.36	3	122.46	5
4	2012PONC02	23.12	5	47.82	12	98.55	39	158.33	102
...	...	...	...	...	...	...	...	...	...
4240	2008VILL01	118.97	27206	182.45	13210	330.10	5725	557.57	4771
4241	2009KIVI01	123.89	27629	196.52	13578	340.73	5790	529.40	4727
4242	2011SANG02	139.73	28592	223.38	14052	541.09	6116	782.00	4842
4243	2008FERN03	146.66	28870	203.17	13733	511.27	6103	718.00	4831
4244	2013HERM02	153.12	29079	231.48	14123	634.00	6128	738.00	4838

	personId	best_4	worldRank_4	best_5	worldRank_5	best_6	worldRank_6	best_7	worldRank_7
0	2012PARK03	19.88	1	38.45	1	75.63	1	106.57	1
1	2010WEYE02	21.46	2	45.80	8	87.19	8	154.27	78
2	2009ZEMD01	21.57	3	42.09	3	81.90	5	120.63	4
3	2016KOLA02	22.00	4	39.79	2	80.36	3	122.46	5
4	2012PONC02	23.12	5	47.82	12	98.55	39	158.33	101
...	...	...	...	...	...	...	...	...	...
4240	2008VILL01	118.97	4241	182.45	4226	330.10	4151	557.57	4194
4241	2009KIVI01	123.89	4242	196.52	4230	340.73	4167	529.40	4165
4242	2011SANG02	139.73	4243	223.38	4239	541.09	4243	782.00	4239
4243	2008FERN03	146.66	4244	203.17	4233	511.27	4241	718.00	4230
4244	2013HERM02	153.12	4245	231.48	4241	634.00	4245	738.00	4236

Analyzing WCA Competitors' Relative Performances in Big Cube Events¶

Introduction¶

Data Collection¶

Data Representation¶

Data Analysis¶

Hypothesis Testing¶

Conclusion¶