Project members¶
- Aryan Tajne
- Dahong Luo
Why we chose this dataset¶
We chose this dataset because we are heavily invested in the entire lifecycle of the game Cyberpunk 2077. Since its release, the game has received heavy criticism for its performance and bugs. We wanted to see whether the reviews on Steam reflected the same sentiment in the beginning, as well as analyse how the game improved over time. Furthermore, we wanted to analyse the reviews to see if there is a correlation between the number of upvotes and the number of funny votes a review gets. This would further our understanding of what makes a review popular on Steam, as well as how games are perceived by the Steam community.
Questions we have¶
We are interested in how many upvotes and funny votes a review gets. Reviews with more upvotes are generally more helpful to other users. By estimating the number of upvotes a review will receive, we can push it upwards so that other users can see it. Furthermore, sentiment analysis can be performed on the reviews to determine whether they are positive or negative. We can use this information to see if the reviews are biased towards a certain sentiment, which will help us understand how the game is perceived by the Steam community.
Why is this important?¶
This is important because it will help us understand how the game is actually perceived by the Steam community and the gaming community in general. Cyberpunk 2077 had a very rocky launch and was heavily criticized for its performance and bugs, and we want to see if the reviews on Steam reflect the same sentiment. We also want to see if the reviews are biased towards a certain sentiment. Finally, we want to see if there is a correlation between the number of upvotes and the number of funny votes a review gets, which will help us understand what makes a review popular on Steam. We both play a lot of games in our free time and are interested in the data science aspect of Cyberpunk 2077's reviews.
Contribution Checkpoints¶
A: Project idea - 5%
B: Dataset Curation and Preprocessing - 10%
C: Data Exploration and Summary Statistics - 10%
D: ML Algorithm Design/Development - 25%
E: ML Algorithm Training and Test Data Analysis - 20%
F: Visualization, Result Analysis, Conclusion - 15%
G: Final Tutorial Report Creation - 10%
H: Additional (not listed above, if any) - 5%
Member 1: Aryan Tajne, Contribution 100%
Member 2: Dahong Luo, Contribution 100%
We, all team members, agree together that the above information is true, and we are confident about our contributions to this submitted project/final tutorial.
- Aryan Tajne, May 7, 2024
- Dahong Luo, May 7, 2024
Contribution Summary¶
- Aryan Tajne
- Helped in the project idea by searching for the dataset and coming up with the "why we chose this dataset" section as well as the "questions we have" section.
- Prepared the dataset by importing the dataset as well as analysing if any preprocessing was required. Luckily, the dataset was filtered.
- Helped in coming up with ideas for preprocessing as well as data exploration (we both were sitting together and discussing the ideas).
- Did two hypothesis tests, the 2nd and the 3rd one.
- Worked on the first part of Primary Data Analysis.
- Helped in the ideation part for Primary Data Analysis.
- Helped in visualization code as well as ideas.
- Helped in the Final Tutorial Report Creation on Github Pages by publishing on Github Pages.
- Wrote conclusions as well as gave ideas to Dahong for it and discussed.
- Dahong Luo
- Searched for datasets and helped in finalizing the dataset as well as coming up with ideas for the "questions we have" section.
- Described the dataset and the columns in the dataset and wrote explanations.
- Performed data preprocessing and wrote the code for data exploration part (we both were sitting together and discussing the ideas).
- Did the first hypothesis test and provided input for the 2nd and 3rd hypothesis tests.
- Worked on the second part of Primary Data Analysis.
- Helped in the ideation part for Primary Data Analysis.
- Also helped in visualization code as well as ideas.
- Helped in the Final Tutorial Report Creation on Github Pages by giving ideas and discussing.
- Wrote conclusions as well as gave ideas to Aryan for it and discussed.
NOTE: We both worked together in-person and discussed the ideas together in-person as well as on text 99% of the time. We never worked without a thorough discussion. Since in-person discussions cannot be quantified or categorized exactly, it is safe to say and we both agree that we both worked on the project as equally as possible.
Data Curation¶
We will be citing the Data Source, importing dependencies, loading the dataset, and describing the dataset in this section.
Data Source¶
We got a dataset from Kaggle which contains reviews of Cyberpunk 2077 from the Steam platform.
- Link: https://www.kaggle.com/datasets/kamilneumann/cyberpunk-2077-steam-reviews
- Author: Kamil Neumann
- License: MIT License
Unfortunately, the dataset is too large to be uploaded to GitHub. However, the dataset can be downloaded from the link above.
Dependencies¶
We import the necessary dependencies in this section.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import chi2_contingency
import seaborn as sns
from scipy.stats import ttest_ind
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import mean_absolute_error
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertConfig, DistilBertModel
from tqdm import tqdm
import lightgbm as lgb
Creating a DataFrame¶
Let us now load the dataset into a DataFrame and show the first few rows of the dataset.
df = pd.read_csv("cyberpunk_2077_filtered.csv")
df.head()
| language | review | updated | voted_up | votes_up | votes_funny | steam_purchase | playtime_at_review | |
|---|---|---|---|---|---|---|---|---|
| 0 | english | It's very fun. I don't usually like open world... | 2023-12-13 | True | 0 | 0 | True | 2452 |
| 1 | french | loved it before 2.0\n\n\n\n\ndon't like it any... | 2023-12-13 | False | 0 | 0 | False | 3349 |
| 2 | schinese | 真的值得体验一下 | 2023-12-13 | True | 0 | 0 | True | 2116 |
| 3 | english | Fun game | 2023-12-13 | True | 0 | 0 | True | 10324 |
| 4 | english | cyberpunk | 2023-12-13 | True | 0 | 0 | True | 1580 |
Description of the dataset¶
Let us now describe the dataset and the columns in the dataset.
- language (str): Language of the review
- review (str): The review text
- updated (date): Time the review was updated or posted
- voted_up (bool): Whether the reviewer voted up for the game
- votes_up (int): The number of upvotes on the review
- votes_funny (int): The number of "funny" votes on the review
- steam_purchase (bool): Whether the reviewer purchased the game on Steam
- playtime_at_review (int): The reviewer's playtime at the time of the review, in minutes
Exploratory Data Analysis¶
In this section, we will be performing exploratory data analysis on the dataset. We will first preprocess the dataset and then show information about the dataset. Next, we will perform some necessary data exploration. Finally, we will end with hypothesis testing where we test 3 different hypotheses.
Data Preprocessing¶
Here, we will preprocess the data to make it more readable and easier to work with. We will also convert some columns to the correct data types.
First, let us change the column names and convert the data types for easier analysis. We will also convert playtime to hours played, then drop outliers.
# change column names
df.columns = ["language", "review", "date", "voted_up_for_game", "upvotes_on_comment", "funny_votes_on_comment", "steam_purchase", "hours_played"]
# convert data types
df["language"] = df["language"].astype(str)
df["review"] = df["review"].astype(str)
df["date"] = pd.to_datetime(df["date"])
#df["voted_up_to_game"] = df["voted_up_for_game"].astype(int)
#df["steam_purchase"] = df["steam_purchase"].astype(int)
# converting playtime_at_review to hours_played
df["hours_played"] = df["hours_played"] / 60
# dropping upper outliers (> 3 standard deviations above the mean)
df = df[df["hours_played"] < df["hours_played"].mean() + 3 * df["hours_played"].std()]
df.head()
| language | review | date | voted_up_for_game | upvotes_on_comment | funny_votes_on_comment | steam_purchase | hours_played | |
|---|---|---|---|---|---|---|---|---|
| 0 | english | It's very fun. I don't usually like open world... | 2023-12-13 | True | 0 | 0 | True | 40.866667 |
| 1 | french | loved it before 2.0\n\n\n\n\ndon't like it any... | 2023-12-13 | False | 0 | 0 | False | 55.816667 |
| 2 | schinese | 真的值得体验一下 | 2023-12-13 | True | 0 | 0 | True | 35.266667 |
| 3 | english | Fun game | 2023-12-13 | True | 0 | 0 | True | 172.066667 |
| 4 | english | cyberpunk | 2023-12-13 | True | 0 | 0 | True | 26.333333 |
Next, let us check the information about the dataset.
df.info() #checking the info of the dataset
<class 'pandas.core.frame.DataFrame'>
Index: 606133 entries, 0 to 612379
Data columns (total 8 columns):
 #   Column                  Non-Null Count   Dtype
---  ------                  --------------   -----
 0   language                606133 non-null  object
 1   review                  606133 non-null  object
 2   date                    606133 non-null  datetime64[ns]
 3   voted_up_for_game       606133 non-null  bool
 4   upvotes_on_comment      606133 non-null  int64
 5   funny_votes_on_comment  606133 non-null  int64
 6   steam_purchase          606133 non-null  bool
 7   hours_played            606133 non-null  float64
dtypes: bool(2), datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 33.5+ MB
Data Exploration¶
In this section, we will perform some data exploration on the dataset. We will show the upvote rate over time and statistics on hours played at the time of the review.
Upvote rate over time¶
Let's plot the upvote rate over time. This will help us understand how the reviews have changed over time and track how the developers have improved the game.
upvote_rate = df.groupby([df['date'].dt.year, df['date'].dt.month])["voted_up_for_game"].mean() #here we are grouping and calculating the mean
upvote_rate.plot(kind="line", figsize=(15, 5), title="Upvote Rate over Time", xlabel="Date", ylabel="Upvote Rate") #plotting the graph
plt.show()
It seems that the upvote rate has increased over time, which suggests the developers have improved the game. There was an initial phase of high upvote rate, but that happens to most games before they have been played thoroughly. We can see the rate dip heavily; that's when people realized the launch state was poor, and when Cyberpunk was in the news for all the wrong reasons. Since then, the upvote rate has climbed steadily, which again suggests the developers have improved the game over time.
Hours played statistics¶
Now let's see the statistics on hours played at the time of the review. This will help us understand how much the reviewers have played the game before reviewing it.
# Create a histogram
plt.hist(df['hours_played'], bins=int(df['hours_played'].max()/2))
# Add labels and title
plt.xlabel('Hours played at review')
plt.ylabel('Frequency')
plt.title('Histogram of hours played at Review')
# Show the histogram
plt.show()
It seems that many reviews were written with between 0 and 50 hours played, which may not be enough time to evaluate the game properly. There is also a large number of reviews with between 50 and 100 hours played; those reviewers have spent considerably more time with the game, so their reviews are arguably more accurate than the ones written before the 50-hour mark.
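To put a number on this (a quick check, not in the original notebook), we can compute the share of reviews written before the 50-hour mark:

share_under_50 = (df['hours_played'] < 50).mean() #fraction of reviews with under 50 hours of playtime
print(f"{share_under_50:.1%} of reviews were written with less than 50 hours played")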
Hypothesis Testing¶
Here, we will perform hypothesis testing on the dataset. We will test 3 different hypotheses: whether playtime affects the review positively, whether trolls are more likely to vote down the game, and whether ASCII art affects upvotes.
For all tests, our alpha will be 0.05.
Hypothesis 1: the longer the playtime, the more likely the review is positive¶
H0: longer playtime does not affect the voted-up rate
HA: longer playtime affects the voted-up rate
Let us perform logistic regression to test this hypothesis.
# performing logistic regression
df["intercept"] = 1
print(df[["hours_played", "intercept"]].shape) #checking the shape of the dataset
logit_model = sm.Logit(df["voted_up_for_game"], df[["hours_played", "intercept"]]) #creating the logistic regression model
result = logit_model.fit() #fitting the model
print(result.summary())
(606133, 2)
Optimization terminated successfully.
Current function value: 0.488026
Iterations 6
Logit Regression Results
==============================================================================
Dep. Variable: voted_up_for_game No. Observations: 606133
Model: Logit Df Residuals: 606131
Method: MLE Df Model: 1
Date: Tue, 07 May 2024 Pseudo R-squ.: 0.01342
Time: 23:10:19 Log-Likelihood: -2.9581e+05
converged: True LL-Null: -2.9983e+05
Covariance Type: nonrobust LLR p-value: 0.000
================================================================================
coef std err z P>|z| [0.025 0.975]
--------------------------------------------------------------------------------
hours_played 0.0065 7.86e-05 82.328 0.000 0.006 0.007
intercept 1.1316 0.004 253.720 0.000 1.123 1.140
================================================================================
Since the p-value of hours_played (denoted by P>|z|) is 0.000, which is less than 0.05, we can reject the null hypothesis.
Therefore, the longer the playtime, the more likely the review is positive.
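To make the coefficient more interpretable (a small addition, not part of the original test), we can exponentiate it into an odds ratio, since Logit coefficients are on the log-odds scale:

odds_per_hour = np.exp(result.params["hours_played"]) #each extra hour played multiplies the odds of an upvote by this factor
odds_per_100_hours = np.exp(result.params["hours_played"] * 100) #compounded over 100 hours of playtime
print(f"odds ratio per hour: {odds_per_hour:.4f}, per 100 hours: {odds_per_100_hours:.2f}")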
Next, we will plot the curve against actual mean upvote rate.
playtime_range = np.arange(0, df['hours_played'].max() + 1, 1) #creating a range of playtime
probabilities = result.predict(np.array([playtime_range,np.ones(playtime_range.shape)]).T)
plt.plot(playtime_range, probabilities) #plotting the graph
step = 5
bins = pd.cut(df['hours_played'], bins=range(0, int(df['hours_played'].max()+step), step)) #creating bins
grouped_df = df.groupby(bins, observed=False)['voted_up_for_game'].mean() #observed=False silences a pandas FutureWarning about categorical groupers
plt.plot(np.arange(0, df['hours_played'].max(), step), grouped_df, color='red') #plotting the graph
plt.xlabel('Hours Played at Review')
plt.ylabel('Probability of Upvote')
plt.legend(['Prediction from Logistic Regression', 'Actual Mean Upvote Rate'])
plt.title('Logistic Regression of Upvote Probability on Playtime at Review')
plt.show()
Hmm, the curve fits the data pretty well, and it is increasing. This means that the longer the playtime, the more likely the review is positive.
Hypothesis 2: Those who did not buy the game on steam are most likely trolls and will not vote up¶
H0: buying the game on Steam does not affect the voted-up rate
HA: buying the game on Steam affects the voted-up rate
Let us create a contingency table to test this hypothesis.
contingency = pd.crosstab(df["voted_up_for_game"], df["steam_purchase"]) #creating a contingency table
result = chi2_contingency(contingency) #performing the chi-squared test
print(result)
Chi2ContingencyResult(statistic=1903.13689134948, pvalue=0.0, dof=1, expected_freq=array([[ 12564.97788439, 106176.02211561],
[ 51575.02211561, 435816.97788439]]))
The statistic is really high at 1903.14, and the p-value of 0.0 is less than 0.05, so we can reject the null hypothesis.
This is strong evidence that online trolls voted the game down without even buying it on Steam, which is consistent with typical online troll behavior.
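To see the effect size behind the chi-squared statistic, here is a quick sketch (not in the original analysis) that row-normalizes the contingency table into upvote rates by purchase status:

#normalize="index" turns counts into per-row proportions, i.e. the upvote rate within each purchase group
upvote_rate_by_purchase = pd.crosstab(df["steam_purchase"], df["voted_up_for_game"], normalize="index")
print(upvote_rate_by_purchase)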
Plotting a heatmap to visualize the contingency table.
sns.heatmap(contingency, annot=True, cmap='coolwarm', fmt='d')
<Axes: xlabel='steam_purchase', ylabel='voted_up_for_game'>
From just a quick glance, we can see that people who did not buy the game on Steam are much less likely to vote it up.
Hypothesis 3: ASCII art = more upvotes¶
H0: reviews with ASCII art receive the same average number of upvotes and funny votes as reviews without ASCII art.
HA: reviews with ASCII art receive more upvotes and funny votes on average than reviews without ASCII art.
Let us first create a new column indicating whether the review contains ASCII art.
#all ASCII art on Steam uses braille symbols, which fall in the Unicode range U+2800 to U+28FF, according to manual inspection of the dataset and general knowledge
df['has_ascii_art'] = df['review'].apply(lambda x: any('\u2800' <= c <= '\u28FF' for c in x))
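As a sanity check (using made-up example strings), the detector flags any string containing a braille character, and we can also see what fraction of reviews it flags:

has_braille = lambda x: any('\u2800' <= c <= '\u28FF' for c in x) #same check as above
print(has_braille("⣿⣿⣿ preem game")) #True - contains braille block characters
print(has_braille("preem game")) #False - plain text
print(f"{df['has_ascii_art'].mean():.2%} of reviews contain braille-based ASCII art")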
Now, let us perform a t-test to test this hypothesis.
ascii_art = df[df['has_ascii_art']]
no_ascii_art = df[~df['has_ascii_art']]
#performing a t-test
t_stat, p_val = ttest_ind(ascii_art["upvotes_on_comment"] + ascii_art["funny_votes_on_comment"], no_ascii_art['upvotes_on_comment']+ no_ascii_art['funny_votes_on_comment'], alternative='greater')
print(f"p-value: {p_val}")
p-value: 4.560562825877745e-24
Since the p-value of 4.560562825877745e-24 is less than the alpha value of 0.05, we reject the null hypothesis and conclude that reviews with ASCII art receive more upvotes and funny votes on average than reviews without ASCII art.
Let's plot bar charts of the means to visualize the data.
sns.barplot(x='has_ascii_art', y='upvotes_on_comment', data=df, estimator=np.mean)
plt.title('Mean Upvotes by Presence of ASCII Art in the review')
plt.show()
sns.barplot(x='has_ascii_art', y='funny_votes_on_comment', data=df, estimator=np.mean)
plt.title('Mean Funny Votes by Presence of ASCII Art in the review')
plt.show()
We can see that reviews with ASCII art receive more upvotes and funny votes on average than reviews without ASCII art.
Conclusions¶
- Analysis of the upvote rate over time suggests that the game developers have made significant improvements in newer updates, as the trend indicates a steady increase in upvotes.
- Interestingly, reviewers who did not purchase the game on Steam are more likely to downvote it. This behavior is consistent with the typical online behavior of "trolls" who often engage in negative and disruptive activities.
- The inclusion of ASCII art in reviews has a profound impact on their reception. Reviews featuring ASCII art are significantly more likely to receive upvotes or funny votes.
- The probability of an upvote increases as playtime increases, which suggests that many low-playtime reviews are premature rather than considered.
Primary Data Analysis¶
Part 1: We will finetune a DistilBERT model for sentiment analysis on the reviews. We can then use this model to predict the sentiment of unseen reviews. This helps us analyse Steam reviews in general and would help us understand review sentiment for any game in the future.
Part 2: We will create a Gradient Boosted Tree model to predict the number of upvotes a comment will get in the future. This will help us determine which comments should be pushed upwards so that other users can see them, and will help us understand what makes a review popular on Steam.
Part 1: Sentiment Analysis¶
Here we are going to finetune the DistilBERT model for sentiment analysis on the reviews.
First, let us gather the necessary data and then clean it.
df2 = df[['language', 'review', 'voted_up_for_game', 'hours_played', 'steam_purchase', 'has_ascii_art']]
df2 = df2.dropna() #remove rows that have missing values
df2 = df2[df2['review'] != ''] #remove rows that have empty strings because they are not useful
df2 = df2[df2['hours_played'] > 25] #remove rows with less than 25 hours played, as those reviews are likely premature
df2 = df2[df2['language'] == 'english'] #remove rows if language is not english in order to simplify the analysis
df2 = df2[df2['steam_purchase'] == True] #remove troll reviews
df2 = df2[df2['has_ascii_art'] == False] #we're not here to analyze ascii art but to analyze the reviews
df2 = df2[df2['review'].apply(lambda x: len(x.split()) > 25)] #remove rows that have less than 25 words as they are not useful
df2 = df2[~df2['review'].str.contains('█')] #remove rows that have the block character as they are not useful
df2 = df2[['review', 'voted_up_for_game']] #now keep only the columns that are useful for the model since we have removed the rest
df2['voted_up_for_game'] = df2['voted_up_for_game'].map({True: 1, False: 0}) #we now map the boolean values to 1 and 0
df2 = df2.dropna()
df2 = df2.drop_duplicates()
Checking our data:
df2.head()
| review | voted_up_for_game | |
|---|---|---|
| 0 | It's very fun. I don't usually like open world... | 1 |
| 6 | Coming back to try the game after 2.0 came out... | 0 |
| 11 | Todo valio la pena al final con el mejor endin... | 1 |
| 18 | Do you like Immersion, Becoming utterly lost a... | 1 |
| 44 | good game well worth a play. story gets kinda ... | 1 |
df2.info()
<class 'pandas.core.frame.DataFrame'>
Index: 67124 entries, 0 to 612267
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   review             67124 non-null  object
 1   voted_up_for_game  67124 non-null  int64
dtypes: int64(1), object(1)
memory usage: 1.5+ MB
Perfect! We have the data we need. Now let's do some quick analysis: let's look at the length of the reviews.
#plot length of reviews
plt.hist(df2['review'].apply(len), bins=50)
plt.xlabel('Review Length')
plt.ylabel('Frequency')
plt.title('Length of reviews')
plt.show()
It seems that most reviews are under 512 characters, so they comfortably fit within DistilBERT's 512-token limit (and our tokenizer truncates anything longer anyway). So we are good to go. Now let's set up for the training. We shall use CUDA for training. Anything else (including Macs) is for the weak.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") #while I will be using cuda, others may not have it so I have added this line
Now let's set up some variables.
NOTE: We changed epochs and experimented with it. It's a hyperparameter after all.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2).to(device)
max_length = 512
batch_size = 8
optimizer = Adam(model.parameters(), lr=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=1, verbose=True)
epochs = 2 #this has been changed, we have experimented with this
training_loss = []
validation_loss = []
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
A quick check if the model exists in the directory. If it does, we load it. If not, we train it. This is so that we don't have to train the model everytime we run the notebook.
try:
    model = DistilBertForSequenceClassification.from_pretrained(r'C:\Users\aryan\Documents\Homework\CMSC320\CMSC320-cyberpunk\320_final_model').to(device) #load the finetuned classification model (DistilBertForSequenceClassification, not the bare DistilBertModel, so outputs.logits works later)
    model.eval()
    print('MODEL FOUND, DO NOT RUN THE TRAINING CELL BELOW')
except OSError:
    print("MODEL NOT FOUND, TRAIN THE MODEL")
MODEL FOUND, DO NOT RUN THE TRAINING CELL BELOW
Now let us define a tokenizer function that tokenizes and truncates, then we will split the data into training and validation sets. Then we will encode the data and transfer it to the GPU. DO NOT RUN IF MODEL IS FOUND.
def tokenize(review):
    tokens = tokenizer.encode_plus(
        review,
        truncation=True, #truncate the tokens to the max length, which is 512
        padding="max_length", #pad the tokens to the max length, which is 512
        return_attention_mask=True, #the attention mask tells the model which tokens to pay attention to and which to ignore
        add_special_tokens=True, #the special tokens that DistilBERT uses
        max_length=max_length, #max length is 512
    )
    inputs = torch.tensor(tokens["input_ids"]).to(device) #convert the input ids to a tensor and move to GPU
    attmsk = torch.tensor(tokens["attention_mask"]).to(device) #convert the attention mask to a tensor and move to GPU
    return inputs, attmsk
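A quick check of the helper on a short made-up review (not in the original notebook): both returned tensors should have length max_length (512), with the attention mask marking real tokens as 1 and padding as 0.

ids, attmsk = tokenize("Night City looks incredible after the 2.0 update.") #hypothetical review text
print(ids.shape, attmsk.shape) #both should be torch.Size([512])
print(int(attmsk.sum()), "real tokens; the rest is padding")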
Now let's split the data, encode it, and move it to the GPU.
X_train, X_test, y_train, y_test = train_test_split(df2['review'].values, df2['voted_up_for_game'].values, test_size=0.2, random_state=42) #splitting the data into training and testing data
inputs_X_train = [tokenize(review) for review in X_train] #tokenizing the training data
inputs_X_test = [tokenize(review) for review in X_test] #tokenizing the testing data
X_train2 = torch.stack([inputs[0] for inputs in inputs_X_train]) #stacking the training data
X_test2 = torch.stack([inputs[0] for inputs in inputs_X_test]) #stacking the testing data
train_mask = torch.stack([inputs[1] for inputs in inputs_X_train]) #stacking the attention masks for the training data
test_mask = torch.stack([inputs[1] for inputs in inputs_X_test]) #stacking the attention masks for the testing data
y_train2 = torch.tensor(y_train, dtype=torch.long).to(device) #converting the training labels to a tensor and moving them to GPU
y_test2 = torch.tensor(y_test, dtype=torch.long).to(device) #converting the testing labels to a tensor and moving them to GPU
train_data = TensorDataset(X_train2, train_mask, y_train2) #creating a tensor dataset for the training data, this is used to create a data loader
train_sampler = RandomSampler(train_data) #creating a random sampler for the training data, a random sampler is used to shuffle the data
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) #creating a data loader for the training data, a data loader is used to load the data in batches
#now we do the same for test
test_data = TensorDataset(X_test2, test_mask, y_test2)
test_sampler = SequentialSampler(test_data) #we use a sequential sampler instead of random sampler for the test data because we don't want to shuffle the test data
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
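Before training, it is worth pulling one batch to confirm the shapes are what the model expects (a small sanity check, not in the original notebook):

sample_batch = next(iter(train_dataloader)) #grab a single batch from the loader
for name, t in zip(["input_ids", "attention_mask", "labels"], sample_batch):
    print(name, tuple(t.shape)) #expect (8, 512), (8, 512), and (8,) given batch_size=8 and max_length=512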
Now, we shall finally train the model! This will take some time. Luckily, we have a GPU.
for epoch in range(epochs):
    model.train() #set the model to training mode (note the parentheses - model.train without them is a no-op)
    total_train_loss = 0 #initialize the total training loss to 0
    progress_bar = tqdm(train_dataloader, total=len(train_dataloader), leave=True, desc=f'epoch {epoch + 1}', position=0)
    for batch in progress_bar: #for each batch in the training data, we use the progress bar to show the progress
        batch = tuple(t.to(device) for t in batch)
        input_ids = batch[0] #input ids
        mask = batch[1] #attention mask
        labels = batch[2] #labels
        optimizer.zero_grad() #set the gradients to zero
        outputs = model(input_ids, attention_mask=mask, labels=labels) #get the outputs from the model
        loss = outputs.loss #now we get the loss
        total_train_loss += loss.item() #add the loss to the total training loss
        loss.backward() #backpropagation, the best part of this
        optimizer.step() #step the optimizer
        progress_bar.set_postfix({'Training loss': total_train_loss / len(train_dataloader)})
    training_loss.append(total_train_loss / len(train_dataloader)) #append the training loss to the training loss list
    model.eval() #set the model to evaluation mode
    total_test_loss = 0 #initialize the total test loss to 0
    progress_bar = tqdm(test_dataloader, leave=True, total=len(test_dataloader), desc=f'epoch {epoch + 1}', position=0)
    for batch in progress_bar:
        with torch.no_grad(): #we don't want to calculate gradients for the test data
            batch = tuple(t.to(device) for t in batch)
            input_ids = batch[0]
            mask = batch[1]
            labels = batch[2]
            outputs = model(input_ids, attention_mask=mask, labels=labels) #get the outputs from the model
            loss = outputs.loss #get the loss
            total_test_loss += loss.item()
            #we don't backpropagate here because we are not training the model
        progress_bar.set_postfix({'Validation loss': total_test_loss / len(test_dataloader)})
    validation_loss.append(total_test_loss / len(test_dataloader))
    #now we print the necessary information
    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Training Loss: {training_loss[-1]}")
    print(f"Validation Loss: {validation_loss[-1]}")
epoch 1: 100%|██████████| 6713/6713 [1:02:29<00:00, 1.79it/s, Training loss=0.176]
epoch 1: 100%|██████████| 1679/1679 [04:55<00:00, 5.68it/s, Validation loss=0.167]
Epoch 1/2
Training Loss: 0.1755063012321222
Validation Loss: 0.16723084198715893
epoch 2: 100%|██████████| 6713/6713 [1:02:11<00:00, 1.80it/s, Training loss=0.0899]
epoch 2: 100%|██████████| 1679/1679 [04:55<00:00, 5.68it/s, Validation loss=0.158]
Epoch 2/2
Training Loss: 0.08988753507845139
Validation Loss: 0.15837545822178786
Now we save the model so that we don't have to train it again.
model.save_pretrained('320_final_model')
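One small addition worth making here (not in the original notebook): saving the tokenizer next to the model, so the pair can be reloaded together later.

tokenizer.save_pretrained('320_final_model') #saves the vocab and tokenizer config alongside the model weights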
Now let's plot the training and validation loss.
plt.plot(training_loss, label='Training Loss')
plt.plot(validation_loss, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()
Let's evaluate the model on the test set. We will also calculate the accuracy of the model.
model.eval() #set the model to evaluation mode
actual = []
prediction = []
for batch in test_dataloader: #for each batch in the test data
    with torch.no_grad(): #we don't want to calculate gradients for the test data
        batch = tuple(t.to(device) for t in batch)
        input_ids = batch[0]
        mask = batch[1]
        labels = batch[2]
        outputs = model(input_ids, attention_mask=mask)
        actual.extend(labels.cpu().numpy()) #add the actual values
        prediction.extend(torch.argmax(outputs.logits, axis=1).cpu().numpy()) #get the predictions
print(classification_report(actual, prediction)) #print the classification report
precision recall f1-score support
0 0.84 0.83 0.83 2305
1 0.96 0.97 0.97 11120
accuracy 0.94 13425
macro avg 0.90 0.90 0.90 13425
weighted avg 0.94 0.94 0.94 13425
Woah that's pretty nice accuracy, precision, and recall!
Now let's print the accuracy based on the actual and predicted values we collected above.
accuracy = accuracy_score(actual, prediction)
print(f"accuracy: {accuracy}")
accuracy: 0.942048417132216
Now, finally, let's plot a confusion matrix. A confusion matrix is a table that describes the performance of a classification model on test data.
matrix = confusion_matrix(actual, prediction)
plt.matshow(matrix)
plt.colorbar()
plt.xlabel('predicted values')
plt.ylabel('actual values')
plt.show()
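With the finetuned model, predicting the sentiment of a single unseen review is straightforward. Here is a minimal sketch (the review text is made up, and predict_sentiment is a helper we are defining here, not part of the notebook above):

def predict_sentiment(review_text):
    model.eval() #make sure dropout is disabled
    enc = tokenizer(review_text, truncation=True, padding="max_length", max_length=max_length, return_tensors="pt").to(device) #tokenize a single review as a batch of one
    with torch.no_grad():
        logits = model(**enc).logits
    return "positive" if logits.argmax(dim=1).item() == 1 else "negative"

print(predict_sentiment("Great story, but the driving still feels a bit clunky."))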
Part 2: Upvotes prediction¶
We will be using a gradient boosted tree model to predict the number of upvotes a review will get.
First, we will preprocess the data.
Let's see how many languages there are.
df["language"].unique()
array(['english', 'french', 'schinese', 'koreana', 'spanish', 'brazilian',
'russian', 'turkish', 'tchinese', 'german', 'italian',
'portuguese', 'ukrainian', 'norwegian', 'polish', 'thai',
'finnish', 'czech', 'danish', 'hungarian', 'dutch', 'latam',
'swedish', 'vietnamese', 'japanese', 'romanian', 'indonesian',
'greek', 'bulgarian'], dtype=object)
Now let's clean and preprocess the data.
def preprocess(df):
    # combine both Chinese languages
    df['language'] = df['language'].replace('schinese', 'chinese')
    df['language'] = df['language'].replace('tchinese', 'chinese')
    # adding words_count
    # separating Asian languages from the others, since Chinese, Korean, and Japanese do not put spaces between words
    df.loc[df['language'].isin(['chinese', 'koreana', "japanese"]), 'words_count'] = df['review'].apply(lambda x: len(x))
    df.loc[~df['language'].isin(['chinese', 'koreana', "japanese"]), 'words_count'] = df['review'].apply(lambda x: len(x.split()))
    # adding days_since_updated
    df["days_since_updated"] = (df["date"].max() - df["date"]).dt.days
    # adding spoiler_included, which checks if the review contains the literal [spoiler] tag
    # (regex=False is needed: unescaped, "[spoiler]" is a regex character class matching any single letter in "spoiler")
    df["spoiler_included"] = df["review"].str.contains("[spoiler]", case=False, regex=False)
    # removing columns that are not needed
    df.drop(columns=["review","date","funny_votes_on_comment","intercept"], inplace=True)
    # casting the language column to the category dtype
    df["language"] = df["language"].astype("category")
    return df
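One subtlety in the function above: pandas treats the pattern in str.contains as a regular expression by default, and an unescaped "[spoiler]" is a character class matching any single letter in "spoiler". A quick demonstration with made-up strings:

s = pd.Series(["great game", "[spoiler] V's story ends badly", "no tag here"])
print(s.str.contains("[spoiler]", case=False).tolist()) #[True, True, True] - matches any of the letters s, p, o, i, l, e, r
print(s.str.contains("[spoiler]", case=False, regex=False).tolist()) #[False, True, False] - matches only the literal tag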
A little more processing:
df_copy = df.copy()
preprocess(df_copy)
df_copy
| language | voted_up_for_game | upvotes_on_comment | steam_purchase | hours_played | has_ascii_art | words_count | days_since_updated | spoiler_included | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | english | True | 0 | True | 40.866667 | False | 39.0 | 0 | True |
| 1 | french | False | 0 | False | 55.816667 | False | 14.0 | 0 | True |
| 2 | chinese | True | 0 | True | 35.266667 | False | 8.0 | 0 | False |
| 3 | english | True | 0 | True | 172.066667 | False | 2.0 | 0 | True |
| 4 | english | True | 0 | True | 26.333333 | False | 1.0 | 0 | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 612375 | english | False | 18 | True | 0.100000 | False | 11.0 | 1098 | True |
| 612376 | english | True | 19 | False | 0.100000 | False | 105.0 | 1098 | True |
| 612377 | chinese | True | 78 | False | 0.100000 | True | 2656.0 | 383 | True |
| 612378 | english | True | 709 | False | 0.083333 | False | 10.0 | 1098 | True |
| 612379 | english | True | 22 | False | 0.100000 | False | 47.0 | 750 | True |
606133 rows × 9 columns
Next, we will split the data into training, validating, and testing sets.
target = df_copy["upvotes_on_comment"]
df_copy.drop(columns=["upvotes_on_comment"],inplace=True)
X_train, X_test, y_train, y_test = train_test_split(df_copy, target, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
Finally, we will train the model.
d_train = lgb.Dataset(X_train, label=y_train)
d_val = lgb.Dataset(X_val, label=y_val)
params = {
'objective':'regression_l1',
'boosting':'gbdt',
'verbose': 1,
'seed': 42,
'early_stopping_rounds':5,
'learning_rate':0.3,
'num_iterations':100,
'num_leaves':100,
'min_data_in_leaf':50,
'min_child_weight':0,
'min_split_gain':0,
'device':'gpu',
'min_data_in_bin':100
}
cate_features_name = ['language', "steam_purchase", 'has_ascii_art', 'spoiler_included', "voted_up_for_game"]
model = lgb.train(params, d_train, categorical_feature = cate_features_name, valid_sets = d_val)
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 791
[LightGBM] [Info] Number of data points in the train set: 387924, number of used features: 8
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3050 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 4 dense feature groups (1.48 MB) transferred to GPU in 0.004146 secs. 1 sparse feature groups
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[30]	valid_0's l1: 2.45309
Let's see how our model performed!
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)
y_pred_test = model.predict(X_test)
mae = mean_absolute_error(y_pred_train, y_train)
print('Training set loss (MAE):%.3f' % mae)
#Mean Absolute error (MAE) for validation set
mae = mean_absolute_error(y_pred_val, y_val)
print('Validation set loss (MAE):%.3f' % mae)
#Mean Absolute error (MAE) for test set
mae = mean_absolute_error(y_pred_test, y_test)
print('Test set loss (MAE):%.3f' % mae)
Training set loss (MAE):2.870
Validation set loss (MAE):2.453
Test set loss (MAE):2.956
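For context on these numbers, here is a quick baseline sketch (not in the original notebook): under L1 loss the best constant predictor is the median of the target, so the model is only useful insofar as it beats a constant-median guess.

baseline_pred = np.full(len(y_test), np.median(y_train)) #predict the training median for every test row
print('Median-baseline test MAE:%.3f' % mean_absolute_error(y_test, baseline_pred))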
Our model performs pretty well! Let's plot Frequency vs upvotes count difference now.
error_gb_train = y_train - y_pred_train
error_gb_test = y_test - y_pred_test
plt.hist(error_gb_test, bins=15000, label='lightGBM')
plt.ylabel('Frequency')
plt.xlabel('upvotes count difference')
plt.legend()
plt.xlim(-10,10)
plt.show()
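Another way to inspect the model (a small addition, not in the original notebook) is LightGBM's built-in feature importance plot, which counts how often each feature is used in a split:

lgb.plot_importance(model, importance_type='split') #number of times each feature is used to split the data
plt.title('LightGBM feature importance')
plt.show()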
Let's try visualizing the tree (note this was done externally)

Amazing.
Insights and Conclusions¶
There are several insights that we can gather from the data. We will be discussing the insights and conclusions below:
- Looking at the upvote rate over time, it can be said that the developers improved the game in newer updates, as the plot tends to increase over time.
- Looking at hours played at the time of the review, we can see that many reviewers played the game for less than 50 hours before reviewing it, which is arguably not enough time to evaluate the game properly. Reviews written after the reviewer has played for more than 50 hours therefore likely reflect the state of the game better.
- The point above is further supported by the fact that the probability of an upvote increases as hours played increases.
- Reviewers who did not buy the game on Steam have a higher probability of downvoting it. This is consistent with the behavior of online trolls that is generally seen online.
- Having ASCII art in a review greatly increases its chance of getting an upvote or a funny vote. ASCII art is often seen as a fun way of introducing a review, and people tend to like it. From the ASCII art we extracted (see asciiart.txt), much of it is funny or lewd and often references current online meme culture, which makes it more likely to get upvoted or funny-voted.
- It is possible to predict whether a reviewer gave a positive or negative review based on the review text. The model we trained reaches an accuracy of 0.94 after just 2 epochs, which is good.
- Looking at the precision, recall, and f1-score of the finetuned model, performance differs by class: the negative class scores 0.84 precision, 0.83 recall, and 0.83 f1-score, while the much more common positive class scores 0.96, 0.97, and 0.97. The validation loss also kept dropping between epochs (0.167 after epoch 1 to 0.158 after epoch 2), suggesting Steam reviews differ somewhat from the data the base model was pretrained on and benefit from finetuning.
- The Gradient Boosted Tree model we made takes in language, voted_up_for_game, steam_purchase, hours_played, has_ascii_art, words_count, days_since_updated, and spoiler_included. We can feed it any review to predict how many upvotes that review will get in the future, and use that prediction to push promising reviews upwards so that other users can see them. This helps us understand what makes a review popular on Steam, and might even approximate the algorithm Steam uses to surface reviews.