Financial Sentiment Analysis¶
By: Jordy Alfaro Brenes
Date: April 2025
This notebook analyzes sentiment in financial texts using a combination of natural language processing and machine learning techniques. The dataset contains financial sentences labeled with sentiment (positive, negative, or neutral).
Dataset Information¶
The dataset combines data from FiQA and Financial PhraseBank, providing financial sentences with sentiment labels. It's intended for advancing financial sentiment analysis research.
Citation: Malo, Pekka, et al. "Good debt or bad debt: Detecting semantic orientations in economic texts." Journal of the Association for Information Science and Technology 65.4 (2014): 782-796.
# 1. Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings('ignore')
# Download the NLTK resources used for tokenization, stopwords, and lemmatization
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
# Set visualization style
plt.style.use('fivethirtyeight')
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
# 2. Load and Explore the Data
# Load the dataset
df = pd.read_csv('data.csv')
# Display basic information
print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows:")
display(df.head())
# Check columns and data types
print("\nData types:")
df.info()
# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())
# Check class distribution
print("\nClass distribution:")
print(df['Sentiment'].value_counts())
print(df['Sentiment'].value_counts(normalize=True) * 100)
# Display examples from each class
print("\nExample of positive sentence:")
print(df[df['Sentiment'] == 'positive']['Sentence'].iloc[0])
print("\nExample of negative sentence:")
print(df[df['Sentiment'] == 'negative']['Sentence'].iloc[0])
if 'neutral' in df['Sentiment'].unique():
    print("\nExample of neutral sentence:")
    print(df[df['Sentiment'] == 'neutral']['Sentence'].iloc[0])
Dataset shape: (5842, 2)

First 5 rows:

|   | Sentence | Sentiment |
|---|---|---|
| 0 | The GeoSolutions technology will leverage Bene... | positive |
| 1 | $ESI on lows, down $1.50 to $2.50 BK a real po... | negative |
| 2 | For the last quarter of 2010 , Componenta 's n... | positive |
| 3 | According to the Finnish-Russian Chamber of Co... | neutral |
| 4 | The Swedish buyout firm has sold its remaining... | neutral |

Data types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5842 entries, 0 to 5841
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Sentence   5842 non-null   object
 1   Sentiment  5842 non-null   object
dtypes: object(2)
memory usage: 91.4+ KB

Missing values:
Sentence     0
Sentiment    0
dtype: int64

Class distribution:
Sentiment
neutral     3130
positive    1852
negative     860
Name: count, dtype: int64
Sentiment
neutral     53.577542
positive    31.701472
negative    14.720986
Name: proportion, dtype: float64

Example of positive sentence:
The GeoSolutions technology will leverage Benefon 's GPS solutions by providing Location Based Search Technology , a Communities Platform , location relevant multimedia content and a new and powerful commercial model .

Example of negative sentence:
$ESI on lows, down $1.50 to $2.50 BK a real possibility

Example of neutral sentence:
According to the Finnish-Russian Chamber of Commerce , all the major construction companies of Finland are operating in Russia .
# 3.1 Sentiment Distribution Visualization
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='Sentiment', data=df, palette='viridis')
plt.title('Distribution of Financial Sentiment', fontsize=16)
plt.xlabel('Sentiment', fontsize=14)
plt.ylabel('Count', fontsize=14)
# Add count labels on top of bars
for p in ax.patches:
    ax.annotate(f'{p.get_height()}',
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='bottom',
                fontsize=12)
plt.tight_layout()
plt.show()
Sentiment Distribution Analysis¶
The chart shows an imbalanced distribution of sentiments in the financial dataset:
- Neutral: 3,130 examples (53.6%)
- Positive: 1,852 examples (31.7%)
- Negative: 860 examples (14.7%)

This imbalance is significant and typical in financial data, where:

- Neutral predominance: Most financial communications tend to be objective and factual, avoiding overtly positive or negative language.
- Positive bias: There are more than twice as many positive examples as negative ones, potentially reflecting a general optimism in financial communications or a tendency to present information favorably.
- Negative minority: Texts with negative sentiment represent only 14.7% of the total, which presents a challenge for model training, since there are fewer examples of this sentiment to learn from.
Implications for Modeling¶
This class imbalance will have several important implications for our classification model:
Risk of bias: The model might become biased toward majority classes (neutral and positive) and struggle to correctly detect negative sentiment.
Performance evaluation: Overall accuracy might not be the best metric due to the imbalance. We'll need to analyze precision, recall, and F1-score for each class.
Strategies to consider (a brief code sketch follows this list):
- Undersampling or oversampling techniques to balance classes
- Class weights in algorithms that support them
- Threshold adjustment for decision boundaries
- Focus on F1-score as an evaluation metric
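As a concrete starting point for the class-weight strategy, the sketch below (an illustrative addition, not part of the original pipeline) derives balanced weights from the label distribution; the resulting dictionary can be passed to any scikit-learn model that accepts a class_weight argument.

# Sketch: compute balanced class weights from the label distribution.
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

classes = np.array(['negative', 'neutral', 'positive'])
weights = compute_class_weight('balanced', classes=classes, y=df['Sentiment'])
print(dict(zip(classes, weights.round(2))))
# The same effect is available built in, e.g. LogisticRegression(class_weight='balanced').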
# 3.2 Text Length Analysis
df['text_length'] = df['Sentence'].apply(len)
plt.figure(figsize=(12, 6))
sns.histplot(data=df, x='text_length', hue='Sentiment', bins=50, kde=True, palette='viridis')
plt.title('Distribution of Text Length by Sentiment', fontsize=16)
plt.xlabel('Text Length (characters)', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xlim(0, df['text_length'].quantile(0.99)) # Remove outliers for better visualization
plt.tight_layout()
plt.show()
Text Length Distribution Analysis by Sentiment¶
The histogram shows how text length varies for each sentiment category:
Key Observations¶
General distribution: All sentiment categories show unimodal, right-skewed distributions: most texts are of moderate length, with a tail of much longer ones.
Differences by sentiment:
- Neutral: Shows the greatest variability in length and tends to have longer texts on average, with a peak around 90-110 characters.
- Positive: Concentrated mainly in the 50-100 character range, with a peak near 60 characters.
- Negative: Tends to have the shortest texts, with a peak around 70 characters, and fewer long texts.
Analytical implications:
- More concise negative texts might reflect direct statements about problems or losses.
- Longer neutral texts likely present detailed factual information or technical descriptions.
- The intermediate length of positive texts might indicate a balance between delivering good news and maintaining a professional tone.
Modeling Considerations¶
- The variation in length could be an informative feature for the model, so we might consider including text length as an additional feature.
- We could explore length normalization for some vectorization techniques.
- For very long or very short texts, it may be useful to examine whether the model makes more errors, suggesting the need for additional techniques to handle these extreme cases.
# 3.3 Word Count Analysis
df['word_count'] = df['Sentence'].apply(lambda x: len(str(x).split()))
plt.figure(figsize=(12, 6))
sns.boxplot(x='Sentiment', y='word_count', data=df, palette='viridis')
plt.title('Word Count Distribution by Sentiment', fontsize=16)
plt.xlabel('Sentiment', fontsize=14)
plt.ylabel('Word Count', fontsize=14)
plt.ylim(0, df['word_count'].quantile(0.99)) # Remove outliers for better visualization
plt.tight_layout()
plt.show()
Word Count Distribution Analysis by Sentiment¶
The boxplot displays the distribution of word counts in texts, segmented by sentiment:
Main Observations¶
Median words by sentiment:
- Neutral: Approximately 21 words (highest median)
- Positive: Around 18 words
- Negative: About 17 words (lowest median)
Variability:
- Neutral: Greater dispersion, with a wider interquartile range (approximately 15-28 words)
- Positive and Negative: Show similar variability, but less than neutral texts
Outliers:
- All sentiments have outliers at the upper extreme (exceptionally long texts)
- The threshold for considering a text as unusually long appears to be around 45-50 words
Interpretation¶
Neutral texts tend to be longer, possibly because they contain more technical details, explanations, or contextual information about financial matters.
Negative texts are generally more concise, which could indicate that bad news or problems are communicated more directly and specifically.
The similarity between positive and negative text distributions suggests that both sentiment types are expressed with similar levels of conciseness, but neutral content requires more elaboration.
Processing Implications¶
Text length, measured in word count, could be a useful indicator to distinguish neutral content from emotional content (positive or negative).
Preprocessing will need to adequately handle both very short texts (2-3 words) and exceptionally long ones (over 45 words), as both extremes are present across all sentiments.
For modeling, we might consider length normalization or techniques that account for these differences, especially if we want to distinguish between positive and negative texts, which are more similar to each other in terms of length.
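As one illustrative way to act on this (a sketch, not part of the original pipeline), word counts can be scaled and appended to the TF-IDF matrix built in section 6; X_train and X_train_tfidf are assumed to exist as defined there.

# Sketch: append a scaled word-count column to the TF-IDF features.
# Assumes X_train (raw text Series) and X_train_tfidf from section 6.
from scipy.sparse import hstack
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
train_lengths = scaler.fit_transform(
    X_train.str.split().str.len().to_numpy().reshape(-1, 1)  # word counts, scaled to [0, 1]
)
X_train_combined = hstack([X_train_tfidf, train_lengths])  # TF-IDF plus one extra column
print(X_train_combined.shape)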
# 4.1 Text Preprocessing Function
def preprocess_text(text):
    """
    Clean and normalize a raw financial sentence.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove mentions and hashtags
    text = re.sub(r'@\w+|\#', '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    stop_words.update(['s', 't', 've', 'll', 'd', 'm'])  # contraction fragments left after punctuation removal
    tokens = [word for word in tokens if word not in stop_words and len(word) > 1]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join tokens back to text
    return ' '.join(tokens)
# 4.2 Apply Preprocessing
print("Preprocessing text data...")
df['cleaned_text'] = df['Sentence'].apply(preprocess_text)
print("Preprocessing complete!")
# Display examples of preprocessed text
print("\nOriginal vs Cleaned Text Examples:")
for i in range(3):
    print(f"\nOriginal: {df['Sentence'].iloc[i]}")
    print(f"Cleaned: {df['cleaned_text'].iloc[i]}")
Preprocessing text data...
Preprocessing complete!

Original vs Cleaned Text Examples:

Original: The GeoSolutions technology will leverage Benefon 's GPS solutions by providing Location Based Search Technology , a Communities Platform , location relevant multimedia content and a new and powerful commercial model .
Cleaned: geosolutions technology leverage benefon gps solution providing location based search technology community platform location relevant multimedia content new powerful commercial model

Original: $ESI on lows, down $1.50 to $2.50 BK a real possibility
Cleaned: esi low bk real possibility

Original: For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .
Cleaned: last quarter componenta net sale doubled eurm eurm period year earlier moved zero pretax profit pretax loss eurm
# 5.1 Word Frequency Analysis
def get_top_n_words(corpus, n=20):
    """
    Get the top n most frequent words in a corpus of text.
    """
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
# Get top words for each sentiment class
positive_words = get_top_n_words(df[df['Sentiment'] == 'positive']['cleaned_text'], 20)
negative_words = get_top_n_words(df[df['Sentiment'] == 'negative']['cleaned_text'], 20)
if 'neutral' in df['Sentiment'].unique():
    neutral_words = get_top_n_words(df[df['Sentiment'] == 'neutral']['cleaned_text'], 20)
    has_neutral = True
else:
    has_neutral = False
# 5.2 Visualize Top Words
# Create dataframes for visualization
positive_df = pd.DataFrame(positive_words, columns=['word', 'count'])
negative_df = pd.DataFrame(negative_words, columns=['word', 'count'])
if has_neutral:
    neutral_df = pd.DataFrame(neutral_words, columns=['word', 'count'])
# Create bar charts for top words
if has_neutral:
    fig, axes = plt.subplots(1, 3, figsize=(24, 8))
else:
    fig, axes = plt.subplots(1, 2, figsize=(18, 8))
sns.barplot(x='count', y='word', data=positive_df, ax=axes[0], palette=['#55a868'])
axes[0].set_title('Top Words in Positive Sentences', fontsize=16)
sns.barplot(x='count', y='word', data=negative_df, ax=axes[1], palette=['#c44e52'])
axes[1].set_title('Top Words in Negative Sentences', fontsize=16)
if has_neutral:
    sns.barplot(x='count', y='word', data=neutral_df, ax=axes[2], palette=['#4c72b0'])
    axes[2].set_title('Top Words in Neutral Sentences', fontsize=16)
plt.tight_layout()
plt.show()
Word Frequency Analysis by Sentiment Category¶
The charts show the 20 most frequent words for each sentiment category after preprocessing (stopword removal, lemmatization, etc.).
Common Terms Across Categories¶
Interestingly, several words appear with high frequency across all three categories:
- "eur" (euro)
- "mln" (million)
- "company"
- "sale"
- "finnish" (possibly reflecting a focus on Finnish companies)
- "profit"
This suggests the dataset contains many financial texts related to European (particularly Finnish) companies and their financial results.
Distinctive Patterns by Sentiment¶
- Unique Words in Positive Texts:
- "increased"
- "rose"
These words clearly indicate growth and favorable trends.
- Unique Words in Negative Texts:
- "loss"
- "decreased"
- "compared" - possibly in negative comparison contexts
These words are associated with unfavorable financial results.
- Unique Words in Neutral Texts:
- "service"
- "business"
- "market"
- "group"
These terms tend to be more descriptive and less emotionally charged.
Model Implications¶
Context importance: Since many words appear across all categories, the model will need to learn to interpret these words in their proper context. For example, "profit" could appear in "profit increased" (positive) or "profit decreased" (negative).
Discriminative terms: Words like "loss", "increased" and "decreased" will likely be important features for the model when differentiating between sentiments.
N-gram relevance: The presence of similar words in different categories suggests that bigrams or trigrams (e.g., "profit increased" vs. "profit decreased") might be more informative than individual words.
Domain specialization: The high frequency of specific financial terms ("mln", "eur", "quarter") confirms we're working with a highly specialized dataset in the financial domain.
Potential Improvements¶
- Incorporate n-grams: Add bigrams and trigrams to our vectorization to better capture context (see the sketch after this list).
- Collocation analysis: Examine which words tend to appear together in each sentiment category.
- Custom weighting: Consider assigning specific weights to key discriminative terms during vectorization.
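To illustrate the first improvement, the sketch below rebuilds the TF-IDF features with unigrams and bigrams; it is a starting point rather than a tuned configuration.

# Sketch: TF-IDF with unigrams and bigrams, so that contrasts such as
# "profit increased" vs. "profit decreased" become distinct features.
from sklearn.feature_extraction.text import TfidfVectorizer

ngram_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_ngrams = ngram_vectorizer.fit_transform(df['cleaned_text'])
print(X_ngrams.shape)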
# 6. Feature Engineering and Model Training
# Convert sentiment labels to numeric
sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2} if has_neutral else {'negative': 0, 'positive': 1}
df['sentiment_code'] = df['Sentiment'].map(sentiment_map)
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['sentiment_code'],
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment_code']
)
# Feature extraction with TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
# Display feature dimensions
print(f"Training features shape: {X_train_tfidf.shape}")
print(f"Testing features shape: {X_test_tfidf.shape}")
Training features shape: (4673, 5000)
Testing features shape: (1169, 5000)
# 7.1 Model Evaluation Function
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """
    Train a model, print accuracy, confusion matrix, and classification
    report, and plot the confusion matrix as a heatmap.
    """
    # Train model
    model.fit(X_train, y_train)
    # Predictions
    y_pred = model.predict(X_test)
    # Evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    print(f"--- {model_name} Results ---")
    print(f"Accuracy: {accuracy:.4f}")
    print("\nConfusion Matrix:")
    print(conf_matrix)
    print("\nClassification Report:")
    print(class_report)
    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=sentiment_map.keys(),
                yticklabels=sentiment_map.keys())
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.tight_layout()
    plt.show()
    return model, accuracy
# 7.2 Model 1: Multinomial Naive Bayes
nb_model = MultinomialNB()
nb_model, nb_accuracy = evaluate_model(
    nb_model, X_train_tfidf, X_test_tfidf, y_train, y_test, "Naive Bayes"
)
--- Naive Bayes Results ---
Accuracy: 0.6835
Confusion Matrix:
[[ 7 119 46]
[ 3 598 25]
[ 1 176 194]]
Classification Report:
precision recall f1-score support
0 0.64 0.04 0.08 172
1 0.67 0.96 0.79 626
2 0.73 0.52 0.61 371
accuracy 0.68 1169
macro avg 0.68 0.51 0.49 1169
weighted avg 0.68 0.68 0.63 1169
Naive Bayes Model Analysis¶
The confusion matrix and evaluation metrics for Naive Bayes reveal several important aspects about the model's performance:
Performance Analysis¶
- Overall Accuracy: 68.35%
- Moderate performance that exceeds random classification, but with significant room for improvement.
- Performance by class:
Negative Class (0):
- Precision: 64% - When predicting negative, it's correct 64% of the time
- Recall: 4% - Only identifies 4% of all actual negative texts
- F1-score: 8% - Extremely low performance for this class
Neutral Class (1):
- Precision: 67% - Moderate precision for neutral predictions
- Recall: 96% - Excellent ability to identify neutral texts
- F1-score: 79% - Good overall performance for this class
Positive Class (2):
- Precision: 73% - Best precision among the three classes
- Recall: 52% - Identifies just over half of positive texts
- F1-score: 61% - Moderate performance
- Key observations from confusion matrix:
- The model classifies the vast majority of texts as neutral (893 of 1169)
- It only predicts the negative class 11 times (less than 1% of predictions)
- The biggest confusion occurs with positive texts classified as neutral (176 cases)
Identified Problems¶
Extreme bias toward neutral class: The model has a strong tendency to classify texts as neutral, explaining the high recall (96%) but moderate precision (67%) for this class.
Underrepresentation of negative class: The model practically ignores the negative class, correctly classifying only 7 of 172 negative texts.
Performance imbalance: There's a dramatic difference in recall between classes (4% vs 96% vs 52%).
Possible Improvements¶
- Prior probability adjustment (class_prior):
- Modify prior probabilities to counteract class imbalance
- For example, `MultinomialNB(class_prior=[0.33, 0.33, 0.33])` assigns equal prior probability to each class
- Decision threshold adjustment:
- Implement a classifier with custom thresholds based on Naive Bayes probabilities
- Lower the threshold for the negative class to increase its recall
- Resampling techniques:
- Oversampling the minority class (negative) using techniques like SMOTE
- Undersampling the majority class (neutral)
- Feature engineering:
- Incorporate bigrams and trigrams to better capture context (e.g., "not good" vs "good")
- Experiment with weighting schemes like TF-IDF with different parameters
- Add features based on financial sentiment lexicons
- Consider alternative models:
- While Naive Bayes is efficient, other models like SVM or Random Forest might better handle the imbalance
- Ensemble of multiple classifiers to improve overall performance
- Naive Bayes-specific techniques:
- Adjust the alpha parameter (Laplace smoothing) to find the optimal value
- Try different Naive Bayes variants (Bernoulli NB instead of Multinomial)
The main challenge is improving the detection of the negative class without sacrificing too much performance in the other classes. A combined approach of parameter tuning and class rebalancing will likely yield the best results.
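As a quick starting point for the prior-probability and alpha adjustments suggested above, a sketch like the following could be tried; the grid values are placeholders, not tuned results.

# Sketch: uniform class priors plus a small grid over the smoothing
# parameter alpha, scored with macro F1 so the negative class counts equally.
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

nb_grid = GridSearchCV(
    MultinomialNB(class_prior=[1/3, 1/3, 1/3]),  # equal priors to counter the imbalance
    param_grid={'alpha': [0.01, 0.1, 0.5, 1.0, 2.0]},  # Laplace/Lidstone smoothing values
    scoring='f1_macro',
    cv=5,
)
nb_grid.fit(X_train_tfidf, y_train)
print(nb_grid.best_params_, round(nb_grid.best_score_, 4))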
# 7.3 Model 2: Logistic Regression
lr_model = LogisticRegression(max_iter=1000, C=1.0, solver='lbfgs', n_jobs=-1)
lr_model, lr_accuracy = evaluate_model(
    lr_model, X_train_tfidf, X_test_tfidf, y_train, y_test, "Logistic Regression"
)
--- Logistic Regression Results ---
Accuracy: 0.6929
Confusion Matrix:
[[ 23 116 33]
[ 30 556 40]
[ 8 132 231]]
Classification Report:
precision recall f1-score support
0 0.38 0.13 0.20 172
1 0.69 0.89 0.78 626
2 0.76 0.62 0.68 371
accuracy 0.69 1169
macro avg 0.61 0.55 0.55 1169
weighted avg 0.67 0.69 0.66 1169
Logistic Regression Model Analysis¶
The logistic regression model shows improvement over Naive Bayes, with more balanced performance across classes. Let's analyze the results in detail:
Overall Evaluation¶
- Overall Accuracy: 69.29%
- Slightly higher than Naive Bayes (68.35%)
- Performance by class:
Negative Class (0):
- Precision: 38% - Lower than Naive Bayes (64%)
- Recall: 13% - Significant improvement over Naive Bayes (4%)
- F1-score: 20% - Better balance than the previous model (8%)
Neutral Class (1):
- Precision: 69% - Slightly better than Naive Bayes
- Recall: 89% - Good, though lower than Naive Bayes (96%)
- F1-score: 78% - Similar to the previous model
Positive Class (2):
- Precision: 76% - Slightly higher than the previous model
- Recall: 62% - Significant improvement over Naive Bayes (52%)
- F1-score: 68% - Better than the previous model (61%)
- Confusion matrix analysis:
- Less extreme bias toward the neutral class (804 predictions vs 893 in NB)
- Better balance in negative predictions (61 vs 11 in NB)
- Improved identification of positive texts (231 correct vs 194 in NB)
Observed Improvements¶
Better class balance: Logistic regression distributes its predictions better among the three classes, though still showing a tendency toward the majority class (neutral).
Improvement in negative class: Though still the weakest point, detection of negative sentiment improved significantly (from 4% to 13% recall).
Better overall metrics: Both overall accuracy and class-specific metrics improved, especially the weighted F1-score (0.66 vs 0.63).
Additional Potential Improvements¶
- Decision threshold adjustment:
- Calibrate the logistic regression probabilities to further improve class balance
- Implement a custom decision scheme that favors the minority class (negative)
- Advanced feature engineering:
- Incorporate specific features for detecting negative sentiment
- Explore longer n-grams to better capture negative expressions
- Include features based on financial sentiment lexicons
- Hyperparameter tuning (a sketch follows this list):
- Optimize the regularization parameter C
- Try different solvers (liblinear, saga, etc.)
- Implement more aggressive class weighting with `class_weight='balanced'` or custom values
- Resampling techniques:
- Apply techniques like SMOTE or ADASYN specifically for the minority class
- Consider undersampling the majority class combined with oversampling the minority class
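A hedged sketch of the hyperparameter-tuning idea above; the grid values are illustrative starting points, not tuned choices.

# Sketch: grid search over C, solver, and class weighting for
# Logistic Regression, scored with macro F1.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

lr_grid = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid={
        'C': [0.1, 1.0, 10.0],                    # regularization strength
        'solver': ['lbfgs', 'liblinear', 'saga'],  # solvers mentioned above
        'class_weight': [None, 'balanced'],        # up-weight the minority class
    },
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
)
lr_grid.fit(X_train_tfidf, y_train)
print(lr_grid.best_params_)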
Conclusion¶
Logistic regression shows better overall balance than Naive Bayes, especially in its ability to detect the minority class (negative texts). While there's still room for improvement, this model represents a significant advancement in terms of balanced metrics across classes.
It would be advisable to optimize this model through calibration techniques and hyperparameter tuning before exploring more complex models like Random Forest or SVM.
# 7.4 Model 3: Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model, rf_accuracy = evaluate_model(
    rf_model, X_train_tfidf, X_test_tfidf, y_train, y_test, "Random Forest"
)
--- Random Forest Results ---
Accuracy: 0.6330
Confusion Matrix:
[[ 19 122 31]
[ 84 501 41]
[ 11 140 220]]
Classification Report:
precision recall f1-score support
0 0.17 0.11 0.13 172
1 0.66 0.80 0.72 626
2 0.75 0.59 0.66 371
accuracy 0.63 1169
macro avg 0.53 0.50 0.51 1169
weighted avg 0.62 0.63 0.62 1169
Random Forest Model Analysis¶
The Random Forest model shows different behavior compared to previous models, with notable advantages and disadvantages:
Overall Evaluation¶
- Overall Accuracy: 63.30%
- Lower than previous models (NB: 68.35%, LR: 69.29%)
- Performance by class:
Negative Class (0):
- Precision: 17% - Significantly lower than Naive Bayes (64%) and Logistic Regression (38%)
- Recall: 11% - Better than Naive Bayes (4%) but inferior to Logistic Regression (13%)
- F1-score: 13% - Worse than Logistic Regression (20%) but better than Naive Bayes (8%)
Neutral Class (1):
- Precision: 66% - Lowest of the three models
- Recall: 80% - Lower than Naive Bayes (96%) and Logistic Regression (89%)
- F1-score: 72% - Lowest of all three models for this class
Positive Class (2):
- Precision: 75% - Comparable to Logistic Regression (76%)
- Recall: 59% - Between Naive Bayes (52%) and Logistic Regression (62%)
- F1-score: 66% - Slightly lower than Logistic Regression (68%)
- Confusion matrix analysis:
- Higher number of negative predictions (114 vs 61 in LR and 11 in NB)
- Less tendency to predict the neutral class (763 vs 804 in LR and 893 in NB)
- Significant confusion between neutral and negative classes (84 neutral texts classified as negative)
Key Observations¶
More even class distribution: Random Forest spreads its predictions more across the three classes, which reduces its overall accuracy but potentially offers a more nuanced view.
Weakness in precision for negative class: While it identifies more texts as negative, many are false positives (84 neutral + 11 positive), explaining the low precision (17%).
More conservative with neutral class: Less tendency to predict neutral compared to other models, which reduces recall but potentially offers higher specificity.
Why is Performance Lower?¶
Possible overfitting: Random Forest may be overfitting to specific features of the training set that don't generalize well.
Noisy features: The model might be giving importance to words or patterns that aren't truly discriminative for sentiment.
Class imbalance: Despite its ability to handle imbalanced data, this Random Forest appears affected by the imbalance, especially in the negative class.
Potential Improvements¶
- Hyperparameter tuning (a sketch follows this list):
- Modify `max_depth` to control overfitting
- Increase `n_estimators` to improve generalization
- Adjust `min_samples_leaf` to prevent overly specific splits
- Class weighting:
- Implement `class_weight='balanced'` or custom weights
- Experiment with built-in undersampling techniques
- Feature selection:
- Use Random Forest feature importance to eliminate noisy features
- Implement recursive feature elimination
- Ensemble with other models:
- Combine Random Forest predictions with Logistic Regression through voting or stacking
- Use Random Forest to generate features for a final Logistic Regression model
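A hedged sketch combining the hyperparameter and class-weighting suggestions above; the parameter values are illustrative, not tuned.

# Sketch: a Random Forest with depth control and built-in class weighting.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf_tuned = RandomForestClassifier(
    n_estimators=300,                   # more trees for better generalization
    max_depth=40,                       # cap depth to limit overfitting
    min_samples_leaf=2,                 # avoid overly specific splits
    class_weight='balanced_subsample',  # reweight classes within each bootstrap
    random_state=42,
    n_jobs=-1,
)
rf_tuned.fit(X_train_tfidf, y_train)
print(classification_report(y_test, rf_tuned.predict(X_test_tfidf)))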
Conclusion¶
Random Forest shows interesting behavior in attempting to balance predictions more across classes, but its overall performance is inferior to Logistic Regression. However, it could be valuable as part of a model ensemble or after more extensive hyperparameter tuning.
Based on current results, Logistic Regression remains the most effective model among the three evaluated, offering the best balance between overall accuracy and performance for each class.
# 8. Model Comparison
model_comparison = pd.DataFrame({
    'Model': ['Naive Bayes', 'Logistic Regression', 'Random Forest'],
    'Accuracy': [nb_accuracy, lr_accuracy, rf_accuracy]
})
plt.figure(figsize=(10, 6))
ax = sns.barplot(x='Model', y='Accuracy', data=model_comparison, palette='viridis')
plt.title('Model Accuracy Comparison', fontsize=16)
plt.ylim(0, 1.0)
# Add accuracy values on top of bars
for p in ax.patches:
    ax.annotate(f'{p.get_height():.4f}',
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='bottom',
                fontsize=12)
plt.tight_layout()
plt.show()
# 9. Feature Importance (for the best model)
if lr_accuracy >= nb_accuracy and lr_accuracy >= rf_accuracy:
    best_model = "Logistic Regression"
    # Get feature importance from Logistic Regression coefficients.
    # Note: averaging signed coefficients across classes is a rough proxy,
    # since positive and negative contributions can partially cancel.
    feature_importance = pd.DataFrame({
        'feature': tfidf_vectorizer.get_feature_names_out(),
        'importance': lr_model.coef_[0] if not has_neutral else lr_model.coef_.mean(axis=0)
    })
elif rf_accuracy >= nb_accuracy and rf_accuracy >= lr_accuracy:
    best_model = "Random Forest"
    # Get feature importance from Random Forest
    feature_importance = pd.DataFrame({
        'feature': tfidf_vectorizer.get_feature_names_out(),
        'importance': rf_model.feature_importances_
    })
else:
    best_model = "Naive Bayes"
    # For Naive Bayes, use the absolute gap in per-class log probabilities
    # as a rough importance proxy
    feature_importance = pd.DataFrame({
        'feature': tfidf_vectorizer.get_feature_names_out(),
        'importance': np.abs(nb_model.feature_log_prob_[1] - nb_model.feature_log_prob_[0])
    })
# Sort features by importance
feature_importance = feature_importance.sort_values('importance', ascending=False)
# Plot top 20 important features
plt.figure(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=feature_importance.head(20))
plt.title(f'Top 20 Important Features ({best_model})', fontsize=16)
plt.tight_layout()
plt.show()
# 10.1 Sample Prediction Function
def predict_sentiment(text, model, vectorizer, sentiment_map_inv):
    """
    Preprocess, vectorize, and classify a single text; return the
    predicted label and the class probabilities.
    """
    # Preprocess the text
    cleaned = preprocess_text(text)
    # Vectorize
    text_vectorized = vectorizer.transform([cleaned])
    # Predict
    prediction = model.predict(text_vectorized)[0]
    proba = model.predict_proba(text_vectorized)[0]
    # Map back to sentiment label
    sentiment = sentiment_map_inv[prediction]
    return sentiment, proba
# Create inverse mapping
sentiment_map_inv = {v: k for k, v in sentiment_map.items()}
# Find the best model
if lr_accuracy >= nb_accuracy and lr_accuracy >= rf_accuracy:
    best_model_obj = lr_model
    best_model_name = "Logistic Regression"
elif rf_accuracy >= nb_accuracy and rf_accuracy >= lr_accuracy:
    best_model_obj = rf_model
    best_model_name = "Random Forest"
else:
    best_model_obj = nb_model
    best_model_name = "Naive Bayes"
# 10.2 Sample Predictions
sample_texts = [
    "The company reported strong earnings, beating analyst expectations with record revenue.",
    "The stock plummeted after the company announced significant losses in the last quarter.",
    "The market remained stable today with minor fluctuations across major indices."
]
print("\nSample Predictions:")
for text in sample_texts:
    sentiment, proba = predict_sentiment(text, best_model_obj, tfidf_vectorizer, sentiment_map_inv)
    print(f"\nText: {text}")
    print(f"Predicted Sentiment: {sentiment}")
    print("Class Probabilities:")
    for i, label in sentiment_map_inv.items():
        print(f"  {label}: {proba[i]:.4f}")
Sample Predictions:

Text: The company reported strong earnings, beating analyst expectations with record revenue.
Predicted Sentiment: positive
Class Probabilities:
  negative: 0.0516
  neutral: 0.1593
  positive: 0.7891

Text: The stock plummeted after the company announced significant losses in the last quarter.
Predicted Sentiment: positive
Class Probabilities:
  negative: 0.1384
  neutral: 0.3055
  positive: 0.5561

Text: The market remained stable today with minor fluctuations across major indices.
Predicted Sentiment: neutral
Class Probabilities:
  negative: 0.1749
  neutral: 0.5133
  positive: 0.3118
Feature Importance and Prediction Analysis¶
Most Influential Features (Logistic Regression)¶
The feature importance chart reveals significant patterns about which words have the greatest impact on sentiment classification:
Trend indicators:
- Words like "long", "increased", "grew" and "expands" have high importance, suggesting that terms related to growth are strong predictors of positive sentiment.
- "Lower" and "short" also appear as highly influential, likely as indicators of negative sentiment.
Corporate activity terms:
- "Signed", "buy", "acquisition" and "option" have significant importance, indicating that corporate activities like acquisitions and agreements carry weight in sentiment determination.
Financial state descriptors:
- "Strong" appears as an important feature, clearly indicating positive sentiment.
- "Total" and "price" also considerably influence predictions.
Sample Prediction Analysis¶
The sample predictions reveal both strengths and limitations of the model:
Correct positive case:
- "The company reported strong earnings, beating analyst expectations with record revenue."
- Prediction: Positive (78.9% confidence)
- The model correctly identifies positive terms like "strong earnings" and "beating expectations".
Misclassified negative case:
- "The stock plummeted after the company announced significant losses in the last quarter."
- Incorrect prediction: Positive (55.6% confidence)
- Problem detected: The model doesn't adequately capture the negative impact of words like "plummeted" and "losses", possibly because these terms didn't appear frequently enough in the training data.
Correctly classified neutral case:
- "The market remained stable today with minor fluctuations across major indices."
- Prediction: Neutral (51.3% confidence)
- The model correctly recognizes neutral descriptive language without strong positive or negative connotations.
These examples confirm that the model works reasonably well with positive and neutral texts but has significant difficulties with negative texts, which aligns with the observations from the confusion matrices.
Conclusions¶
Logistic Regression was the best-performing model, achieving an accuracy of 0.6929 on the test set, outperforming both Naive Bayes (0.6835) and Random Forest (0.6330).
Key Findings¶
Class imbalance: The dataset exhibits a strong imbalance (53.6% neutral, 31.7% positive, 14.7% negative), which significantly impacts model performance, especially for the minority class (negative).
Linguistic differences: We found distinct linguistic patterns between sentiment categories:
- Negative texts: Shorter and more direct, but difficult to identify due to their lower representation.
- Neutral texts: Longer, with descriptive and technical language.
- Positive texts: Moderate length, with specific terms related to growth and success.
Performance by class: All models showed uneven behavior:
- Excellent detection of neutral texts (recall ~80-96%)
- Good identification of positive texts (recall ~52-62%)
- Poor recognition of negative texts (recall ~4-13%)
Predictive terms: Words related to directionality ("long", "lower", "increased"), corporate activity ("signed", "buy", "acquisition") and financial valuation ("strong") proved to be the most powerful predictors.
Practical Applications¶
This sentiment analysis model can be valuable for:
- Market monitoring: Automated analysis of financial news to detect sentiment trends.
- Competitive intelligence: Tracking sentiment around competitors or specific sectors.
- Risk management: Early detection of sentiment changes that might indicate emerging risks.
- Investment assistance: Complement to fundamental and technical analyses with market sentiment data.
- Investor relations: Evaluation of corporate communication reception.
Limitations and Considerations¶
- Weakness in negative sentiment: The model has significant difficulties identifying negative texts, which could result in missing important alerts.
- Context and subtlety: The word-based approach may miss important nuances that require contextual understanding.
- Temporal bias: The model is trained on historical data and might not adapt to changes in financial language.
Next Steps¶
Model Improvements:
- Implement class balancing techniques (SMOTE, class_weight)
- Incorporate n-gram features to better capture context
- Experiment with more advanced models like BERT or FinBERT, a finance-specific model (see the sketch after this list)
- Develop an ensemble that leverages the strengths of different models
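As a taste of the FinBERT route, a minimal sketch using the Hugging Face transformers pipeline; this assumes transformers and torch are installed and uses the public ProsusAI/finbert checkpoint, neither of which is part of this notebook's environment.

# Sketch: sentiment with a pretrained FinBERT model.
# Assumes `pip install transformers torch`; labels are positive/negative/neutral.
from transformers import pipeline

finbert = pipeline('sentiment-analysis', model='ProsusAI/finbert')
print(finbert("The stock plummeted after the company announced significant losses."))
# Expected form: [{'label': 'negative', 'score': ...}]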
Application Development:
- Create an API for real-time sentiment analysis
- Develop a dashboard for trend monitoring
- Implement automated alerts for significant sentiment changes
Additional Analyses:
- Examine sentiment trends by sector or industry
- Correlate sentiment with subsequent market movements
- Analyze geographic variations in financial sentiment
- Study the temporal evolution of sentiment during economic events