# Xử lí Sentiment Analysis cho tiếng Việt bằng thư viện Underthesea

https://github.com/undertheseanlp/underthesea

In [5]:
!pip install underthesea
import pandas as pd
import re
import requests
import io
from underthesea import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Download necessary NLTK data
nltk.download('vader_lexicon')

# Custom Vietnamese stopwords list
vietnamese_stopwords = ['là', 'có', 'và', 'của', 'cho', 'trên', 'về', 'như', 'cũng', 'đã', 'được', 'với', 'một', 'các', 'từ', 'nhưng', 'không', 'thì', 'ở']

# Download the file from Dropbox
url = "https://www.dropbox.com/scl/fi/w3iro73xstcle3qoewqf9/classified_posts_processed.csv?rlkey=zhg7komfujn61p4vsm35p34yp&dl=1"
response = requests.get(url)
content = response.content

# Read the CSV file into a DataFrame
df = pd.read_csv(io.StringIO(content.decode('utf-8')))

# Clean text function
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^A-Za-zÀ-ỹà-ỹ\s]', '', text)  # Remove special characters and numbers
    text = text.lower()  # Convert to lowercase
    words = word_tokenize(text, format="text")  # Tokenize using Underthesea
    words = [word for word in words.split() if word not in vietnamese_stopwords]  # Remove stopwords
    cleaned_text = ' '.join(words)  # Join words back into a single string
    return cleaned_text

# Apply text cleaning
df['Cleaned_Text'] = df['Post'].apply(clean_text)

# Sentiment Analysis using VADER
sid = SentimentIntensityAnalyzer()
df['Sentiment'] = df['Cleaned_Text'].apply(lambda x: sid.polarity_scores(x)['compound'])

# Classify sentiment
df['Sentiment_Class'] = df['Sentiment'].apply(lambda x: 'Positive' if x > 0 else ('Negative' if x < 0 else 'Neutral'))

# Save results to a new CSV file
df.to_csv('sentiment_analysis_results.csv', index=False)

# Display the first few rows of the result
print(df.head())


Collecting underthesea
  Downloading underthesea-6.8.0-py3-none-any.whl (20.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m67.4 MB/s[0m eta [36m0:00:00[0m
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m69.5 MB/s[0m eta [36m0:00:00[0m
Collecting underthesea-core==1.0.4 (from underthesea)
  Downloading underthesea_core-1.0.4-cp310-cp310-manylinux2010_x86_64.whl (657 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m657.8/657.8 kB[0m [31m54.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: underthesea-core, python-crfsuite, underthesea
Successfully installed python-crfsuite-0.9.10 underthesea-6.8.0 underthesea-core-1.0.4


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


         Date      Time                                               Post  \
0         NaN       NaN  VÉ BẠN MUỐN HẸN HÒ Show ĐANG VƠI DẦN MÃO THÌN ...   
1         NaN       NaN  MỞ ĐƠN ĐĂNG KÝ THAM GIA "BẠN MUỐN HẸN HÒ Show"...   
2         NaN       NaN         Góc khuất của những ngôi trường có 2 cơ sở   
3         NaN       NaN  Những toà nhà ở USSH HCM cơ sở Thủ Đức (phần 3...   
4  27/04/2024  21:07:40  \nGửi các bạn học khoa s môn thầy L sáng thứ 4...   

                              Classifiers  \
0  Hoạt động Xã hội và Đời sống Sinh viên   
1  Hoạt động Xã hội và Đời sống Sinh viên   
2                     Giáo dục và Học tập   
3                     Giáo dục và Học tập   
4  Hoạt động Xã hội và Đời sống Sinh viên   

                                        Cleaned_Text  Sentiment  \
0  vé bạn muốn hẹn_hò show đang vơi dần mão thìn ...      0.000   
1  mở đơn đăng_ký tham_gia bạn muốn hẹn_hò show c...     -0.128   
2                  góc khuất những ngôi trường cơ_sở      0

# Use PhoBERT instead of VADER for Vietnmese sentiment analysis tasks

In [None]:
!pip install underthesea transformers torch datasets

import pandas as pd
import re
import requests
import io
from underthesea import word_tokenize
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import torch
import numpy as np

# Custom Vietnamese stopwords list
vietnamese_stopwords = ['là', 'có', 'và', 'của', 'cho', 'trên', 'về', 'như', 'cũng', 'đã', 'được', 'với', 'một', 'các', 'từ', 'nhưng', 'không', 'thì', 'ở']

# Download the file from Dropbox
url = "https://www.dropbox.com/scl/fi/w3iro73xstcle3qoewqf9/classified_posts_processed.csv?rlkey=zhg7komfujn61p4vsm35p34yp&dl=1"
response = requests.get(url)
content = response.content

# Read the CSV file into a DataFrame
df = pd.read_csv(io.StringIO(content.decode('utf-8')))

# Clean text function
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^A-Za-zÀ-ỹà-ỹ\s]', '', text)  # Remove special characters and numbers
    text = text.lower()  # Convert to lowercase
    words = word_tokenize(text, format="text")  # Tokenize using underthesea
    words = [word for word in words.split() if word not in vietnamese_stopwords]  # Remove stopwords
    cleaned_text = ' '.join(words)  # Join words back into a single string
    return cleaned_text

# Apply text cleaning
df['Cleaned_Text'] = df['Post'].apply(clean_text)

# Load a sample sentiment analysis dataset for Vietnamese
dataset = load_dataset("vietnlp/load_sst")

# Split dataset into train and test
train_dataset = dataset['train']
test_dataset = dataset['test']

# Load the PhoBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=3)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True)

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "label"])
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Define compute metrics function
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {'accuracy': (preds == p.label_ids).mean()}

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("phobert-sentiment")
tokenizer.save_pretrained("phobert-sentiment")

# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("phobert-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("phobert-sentiment")

# Sentiment Analysis function using the fine-tuned model
def analyze_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
    outputs = model(**inputs)
    scores = outputs[0][0].detach().numpy()
    scores = torch.softmax(torch.tensor(scores), dim=0)
    sentiment_score = scores[1] - scores[0]  # Simplified scoring: positive - negative
    return sentiment_score.item()

# Apply sentiment analysis
df['Sentiment'] = df['Cleaned_Text'].apply(analyze_sentiment)

# Classify sentiment
df['Sentiment_Class'] = df['Sentiment'].apply(lambda x: 'Positive' if x > 0 else ('Negative' if x < 0 else 'Neutral'))

# Save results to a new CSV file
df.to_csv('sentiment_analysis_results.csv', index=False)

# Display the first few rows of the result
print(df.head())


Collecting underthesea
  Downloading underthesea-6.8.0-py3-none-any.whl (20.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
Collecting underthesea-core==1.0.4 (from underthesea)
  Downloading underthesea_core-1.0.4-cp310-cp310-manylinux2010_x86_64.whl (657 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m657.8/657.8 kB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Us

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetNotFoundError: Dataset 'vietnlp/load_sst' doesn't exist on the Hub or cannot be accessed. If the dataset is private or gated, make sure to log in with `huggingface-cli login` or visit the dataset page at https://huggingface.co/datasets/vietnlp/load_sst to ask for access.