# Without Normality Check

In [4]:
import pandas as pd
import numpy as np
from google.colab import files

# Prompt user to upload a file
uploaded = files.upload()
filename = list(uploaded.keys())[0]  # Get the uploaded file name

df = pd.read_csv(filename)

# Convert every column to numeric; cells that cannot be parsed become NaN,
# so a purely textual column turns into an all-NaN float column.
df = df.apply(pd.to_numeric, errors='coerce')

# Drop rows with all NaN values
df = df.dropna(how='all')

# Columns used for the Mahalanobis model. Columns that are entirely NaN
# (e.g. coerced text columns) are excluded here: they carry no information,
# would turn the covariance matrix into NaN, and would make it impossible for
# any row to have as many non-NaN values as there are model columns.
# They are intentionally kept in `df` itself so the exported file is unchanged.
numeric_data = df.select_dtypes(include=[np.number]).dropna(axis=1, how='all')

# Compute mean and covariance (for Mahalanobis), if possible.
# Defaults describe the "not available" state so the names always exist.
data_mean = None
data_cov = None
data_cov_inv = None
mahalanobis_available = False

# np.cov propagates NaN, so estimate the statistics from complete rows only.
# At least two rows and two columns are needed for an invertible 2-D matrix.
complete_rows = numeric_data.dropna(axis=0, how='any')
if complete_rows.shape[0] >= 2 and complete_rows.shape[1] >= 2:
    data_mean = complete_rows.mean(axis=0).values
    data_cov = np.cov(complete_rows, rowvar=False)
    try:
        data_cov_inv = np.linalg.inv(data_cov)
    except np.linalg.LinAlgError:
        # Singular covariance: the distance cannot be computed.
        data_cov_inv = None
    else:
        # Guard against a non-finite inverse, which would yield NaN/inf
        # distances that silently never exceed the outlier threshold.
        mahalanobis_available = bool(np.isfinite(data_cov_inv).all())

def mahalanobis_distance(vals, mean, cov_inv):
    """Return the Mahalanobis distance of ``vals`` from ``mean``.

    Computes ``sqrt((vals - mean) @ cov_inv @ (vals - mean).T)``.

    Args:
        vals: Observation to score (presumably a 1-D array with one entry
            per model column -- confirm against the caller).
        mean: Centre of the distribution, broadcastable against ``vals``.
        cov_inv: Inverse covariance matrix matching the length of ``vals``.

    Returns:
        The square root of the quadratic form described above.
    """
    offset = vals - mean
    projected = offset @ cov_inv
    squared_distance = projected @ offset.T
    return np.sqrt(squared_distance)

def check_patterns(row):
    """Classify one data row and return a warning label (or ``"OK"``).

    NaN entries are discarded and the remaining values are truncated to
    integers before any check runs. The checks are applied in this order and
    the first one that fires determines the result:

    1. Mahalanobis outlier (only when the module-level model is available and
       the row has exactly as many values as ``numeric_data`` has columns).
    2. Sample standard deviation below 0.5.
    3. A run of 36 identical consecutive values.
    4. A strictly increasing run of 6 consecutive values.
    5. A strictly decreasing run of 6 consecutive values.

    Args:
        row: A row whose ``.values`` is a numeric array (as passed by
            ``DataFrame.apply(..., axis=1)``).

    Returns:
        A string describing the first problem found, or ``"OK"``.
    """
    raw = row.values
    # Keep only observed entries, then truncate them to integers.
    observed = raw[np.logical_not(np.isnan(raw))].astype(int)
    n_obs = len(observed)

    # Nothing meaningful can be said about zero or one value.
    if n_obs < 2:
        return "OK"

    # Outlier check: requires an invertible covariance and a complete row.
    if mahalanobis_available and n_obs == numeric_data.shape[1]:
        distance = mahalanobis_distance(observed, data_mean, data_cov_inv)
        if distance > 24.32:
            return f"Outlier (Mahalanobis={int(distance)})"

    # Low-variability check (sample standard deviation).
    if np.std(observed, ddof=1) < 0.5:
        return "SD < 0.5"

    def first_window(width, predicate):
        """Return the first ``width``-long window satisfying ``predicate``."""
        for start in range(n_obs - width + 1):
            chunk = observed[start:start + width]
            if predicate(chunk):
                return [int(v) for v in chunk]
        return None

    # (label, window length, test) -- evaluated strictly in this order.
    window_checks = (
        ("Constant", 36, lambda w: np.all(w == w[0])),
        ("Increasing", 6, lambda w: np.all(w[1:] > w[:-1])),
        ("Decreasing", 6, lambda w: np.all(w[1:] < w[:-1])),
    )
    for label, width, predicate in window_checks:
        hit = first_window(width, predicate)
        if hit is not None:
            return f"{label}: {hit}"

    # If no pattern found
    return "OK"

# Label every row, attach the labels as a new column, and export the result.
row_warnings = df.apply(check_patterns, axis=1)
df['Warning'] = row_warnings
df.to_csv('Cleaned.csv', index=False)
print("Cleaned.csv has been generated with a new 'Warning' column.")


Saving User Perceptions of AI Tools in Linguistics Studies .csv to User Perceptions of AI Tools in Linguistics Studies  (3).csv
