Analyze this csv data file. Dependent variable is tpstress (Total perceived stress). Predictor variables are tpcoiss (Total Perceived Control Internal States) and tmast (Total Mastery). I want to answer these 2 questions "How well do the two measures of control (mastery, PCOISS) predict perceived stress? How much variance in perceived stress scores can be explained by scores on these two scales?

In [15]:
import pandas as pd
import statsmodels.api as sm
import requests
from io import StringIO

# Now that we have a CSV file, let's load it and perform the multiple regression analysis.
# We will analyze the data to answer the two questions regarding the prediction of perceived stress.

# Downloading the CSV data from Dropbox
dropbox_link = 'https://www.dropbox.com/scl/fi/r52t1up2b764d9ozkgga4/survey3ED.csv?rlkey=47nc7liknev0yfkuov3a9d7jq&dl=1'
r = requests.get(dropbox_link)
data = StringIO(r.text)
csv_data = pd.read_csv(data)

# Display the first few rows of the dataframe to understand its structure
print(csv_data.head())

# Extracting the relevant columns for regression analysis
dependent_var = 'tpstress'  # Dependent variable
predictors = ['tpcoiss', 'tmast']  # Predictor variables

# Checking if the necessary columns are present in the dataframe
if not all(column in csv_data.columns for column in [dependent_var] + predictors):
    raise ValueError("The dataset does not contain all the required columns for the regression analysis.")

# Cleaning the data: converting all columns to numeric and handling non-numeric entries
# Any non-numeric values will be set to NaN and then those rows will be dropped
csv_data[predictors] = csv_data[predictors].apply(pd.to_numeric, errors='coerce')
csv_data[dependent_var] = pd.to_numeric(csv_data[dependent_var], errors='coerce')

# Dropping rows with NaN values in the predictor or dependent variables
clean_data = csv_data.dropna(subset=[dependent_var] + predictors)

# Defining the model with the cleaned data
X_clean = clean_data[predictors]
y_clean = clean_data[dependent_var]

# Adding a constant term for the intercept to the model
X_clean = sm.add_constant(X_clean)

# Fitting the regression model with the clean data
regression_model_clean = sm.OLS(y_clean, X_clean).fit()

# Getting the summary of the regression model to answer the questions
regression_summary_clean = regression_model_clean.summary()
print(regression_summary_clean)


    id  sex  age  marital child  educ source smoke smokenum op1  ... tslfest  \
0  415    2   24        4     1     5      7     2            3  ...      35   
1    9    1   39        3     1     5      1     1        2   2  ...      34   
2  425    2   48        4     1     2      4     2            3  ...      31   
3  307    1   41        5     1     2      1     2        0   3  ...      40   
4  440    1   23        1     2     5      1     2        0   3  ...      21   

  tmarlow tpcoiss agegp3 agegp5 educrec        LG10negaff             MAH_1  \
0       4      51      1      1       4   1.6232492903979  .958016199851603   
1       5      40      2      3       4  1.54406804435028  3.64075332624113   
2       8      47      3      4       1  1.14612803567824  1.30741786655502   
3       3      63      2      4       1  1.55630250076729  1.31417610205586   
4       4      46      1      1       4  1.04139268515823  2.71888737968259   

                   COO_1             MAH_2  

Display the data in the required format

In [17]:
import pandas as pd
import statsmodels.api as sm
import requests
from io import StringIO

# Downloading the CSV data from Dropbox
dropbox_link = 'https://www.dropbox.com/scl/fi/r52t1up2b764d9ozkgga4/survey3ED.csv?rlkey=47nc7liknev0yfkuov3a9d7jq&dl=1'
r = requests.get(dropbox_link)
data = StringIO(r.text)
csv_data = pd.read_csv(data)

# Extracting the relevant columns for regression analysis
dependent_var = 'tpstress'  # Dependent variable
predictors = ['tpcoiss', 'tmast']  # Predictor variables

# Checking if the necessary columns are present in the dataframe
if not all(column in csv_data.columns for column in [dependent_var] + predictors):
    raise ValueError("The dataset does not contain all the required columns for the regression analysis.")

# Cleaning the data
csv_data[predictors] = csv_data[predictors].apply(pd.to_numeric, errors='coerce')
csv_data[dependent_var] = pd.to_numeric(csv_data[dependent_var], errors='coerce')
clean_data = csv_data.dropna(subset=[dependent_var] + predictors)

# Defining the model with the cleaned data
X_clean = clean_data[predictors]
y_clean = clean_data[dependent_var]
X_clean = sm.add_constant(X_clean)

# Fitting the regression model with the clean data
regression_model_clean = sm.OLS(y_clean, X_clean).fit()

# Calculate zero-order correlations (Pearson r)
zero_order_correlations = clean_data[predictors].apply(lambda x: x.corr(clean_data[dependent_var]))

# Calculate the multiple correlation coefficient (R)
multiple_R = regression_model_clean.rsquared**0.5

# Calculate structure coefficients (correlation divided by multiple correlation)
structure_coefficients = zero_order_correlations / multiple_R

# Construct the table and format the numerical results to three decimals
summary_data = {
    'Model': ['Constant'] + predictors,
    'b': regression_model_clean.params.round(3),
    'SE-b': regression_model_clean.bse.round(3),
    'Beta': standardized_betas.round(3),
    'Pearson r': zero_order_correlations.round(3),
    'sr2': [None] * (len(predictors) + 1),  # placeholder for semi-partial correlation coefficients
    'Structure Coefficient': structure_coefficients.round(3)
}

# Fill in sr2 values where applicable and round them to three decimals
for i, predictor in enumerate(predictors, start=1):  # Start at 1 to skip 'Constant'
    summary_data['sr2'][i] = round(sr2_values[predictor], 3)

summary_df = pd.DataFrame(summary_data)
summary_df['sr2'] = summary_df['sr2'].fillna('')  # Replace None with empty strings for presentation

# Add R-squared and Adjusted R-squared at the bottom of the DataFrame and round them to three decimals
additional_rows = pd.DataFrame({
    'Model': ['R-squared', 'Adjusted R-squared'],
    'b': [round(regression_model_clean.rsquared, 3), round(regression_model_clean.rsquared_adj, 3)],
    'SE-b': ['', ''],
    'Beta': ['', ''],
    'Pearson r': ['', ''],
    'sr2': ['', ''],
    'Structure Coefficient': ['', '']
})

# Combine the original summary with the additional rows
summary_df = pd.concat([summary_df, additional_rows], ignore_index=True)

# Print out the formatted table
print(summary_df.to_string(index=False))


             Model      b   SE-b   Beta Pearson r    sr2 Structure Coefficient
          Constant 50.828  1.271    0.0       NaN                          NaN
           tpcoiss -0.621  0.061 -0.422    -0.611  0.092                -0.895
             tmast -0.175   0.02 -0.358    -0.581  0.129                -0.851
         R-squared  0.466                                                     
Adjusted R-squared  0.463                                                     


In [19]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import requests
from io import StringIO

# Downloading the CSV data from Dropbox
dropbox_link = 'https://www.dropbox.com/scl/fi/r52t1up2b764d9ozkgga4/survey3ED.csv?rlkey=47nc7liknev0yfkuov3a9d7jq&dl=1'
r = requests.get(dropbox_link)
data = StringIO(r.text)
csv_data = pd.read_csv(data)

# Extracting the relevant columns for regression analysis
dependent_var = 'tpstress'  # Dependent variable
predictors = ['tpcoiss', 'tmast']  # Predictor variables

# Checking if the necessary columns are present in the dataframe
if not all(column in csv_data.columns for column in [dependent_var] + predictors):
    raise ValueError("The dataset does not contain all the required columns for the regression analysis.")

# Cleaning the data
csv_data[predictors] = csv_data[predictors].apply(pd.to_numeric, errors='coerce')
csv_data[dependent_var] = pd.to_numeric(csv_data[dependent_var], errors='coerce')
clean_data = csv_data.dropna(subset=[dependent_var] + predictors)

# Defining the model with the cleaned data
X_clean = clean_data[predictors]
y_clean = clean_data[dependent_var]
X_clean = sm.add_constant(X_clean)

# Fitting the regression model with the clean data
regression_model_clean = sm.OLS(y_clean, X_clean).fit()

# Calculate zero-order correlations (Pearson r)
zero_order_correlations = clean_data[predictors].apply(lambda x: x.corr(clean_data[dependent_var]))

# Calculate the multiple correlation coefficient (R)
multiple_R = np.sqrt(regression_model_clean.rsquared)

# Calculate structure coefficients (correlation divided by multiple correlation)
structure_coefficients = zero_order_correlations / multiple_R

# Calculate sr2 values
sr2_values = {}
for predictor in predictors:
    # Construct a model without the current predictor
    other_predictors = [p for p in predictors if p != predictor]
    X_other = sm.add_constant(clean_data[other_predictors])
    reduced_model = sm.OLS(y_clean, X_other).fit()
    # Calculate the change in R-squared
    sr2 = regression_model_clean.rsquared - reduced_model.rsquared
    sr2_values[predictor] = sr2

# Construct the table and format the numerical results to three decimals
summary_data = {
    'Model': ['Constant'] + predictors,
    'b': regression_model_clean.params.round(3),
    'SE-b': regression_model_clean.bse.round(3),
    'Beta': '',  # Placeholder for standardized betas
    'Pearson r': zero_order_correlations.round(3),
    'sr2': [np.nan] + [round(v, 3) for v in sr2_values.values()],
    'Structure Coefficient': structure_coefficients.round(3)
}

summary_df = pd.DataFrame(summary_data)

# Replace NaN with empty strings for presentation
summary_df['sr2'] = summary_df['sr2'].fillna('')

# Add R-squared and Adjusted R-squared at the bottom of the DataFrame and round them to three decimals
additional_rows = pd.DataFrame({
    'Model': ['R-squared', 'Adjusted R-squared'],
    'b': [round(regression_model_clean.rsquared, 3), round(regression_model_clean.rsquared_adj, 3)],
    'SE-b': ['', ''],
    'Beta': ['', ''],
    'Pearson r': ['', ''],
    'sr2': ['', ''],
    'Structure Coefficient': ['', '']
})

# Combine the original summary with the additional rows
summary_df = pd.concat([summary_df, additional_rows], ignore_index=True)

# Print out the formatted table
print(summary_df.to_string(index=False))


             Model      b   SE-b Beta Pearson r    sr2 Structure Coefficient
          Constant 50.828  1.271            NaN                          NaN
           tpcoiss -0.621  0.061         -0.611  0.092                -0.895
             tmast -0.175   0.02         -0.581  0.129                -0.851
         R-squared  0.466                                                   
Adjusted R-squared  0.463                                                   
