main1.py
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Bidirectional, TimeDistributed, Dropout, Layer
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import initializers
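
# NLTK resources used by the preprocessing below; downloading is a no-op
# if they are already installed (run once per environment).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')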
# Load the dataset
df = pd.read_csv('BBC News Text.csv')
# Text preprocessing
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    # Lowercasing
    tokens = [word.lower() for word in tokens]
    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
df['clean_text'] = df['text'].apply(preprocess_text)
# Tokenization and padding. The hierarchical model below expects one input per
# document with shape (MAX_SENTENCES, MAX_SENTENCE_LENGTH), so each article is
# split into sentences and each sentence into word indices, padded/truncated at both levels.
MAX_SENTENCE_LENGTH = 50
MAX_SENTENCES = 15
MAX_WORDS = 10000
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(df['clean_text'])
X = np.zeros((len(df), MAX_SENTENCES, MAX_SENTENCE_LENGTH), dtype='int32')
for i, text in enumerate(df['text']):
    sentences = sent_tokenize(text)[:MAX_SENTENCES]
    seqs = tokenizer.texts_to_sequences([preprocess_text(s) for s in sentences])
    seqs = pad_sequences(seqs, maxlen=MAX_SENTENCE_LENGTH, padding='post', truncating='post')
    X[i, :len(seqs)] = seqs
# Splitting into train and test sets
y = pd.get_dummies(df['category']).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Hierarchical Attention Network architecture
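# The attention layer follows the hierarchical attention formulation of
# Yang et al. (2016): each timestep's hidden state h_t is projected through a
# one-layer MLP (u_t = tanh(W h_t + b)), scored against a learned context
# vector u (a_t = softmax(u_t . u)), and the output is the attention-weighted
# sum of the hidden states. The same layer is reused at the word level and at
# the sentence level.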
class HierarchicalAttention(Layer):
    def __init__(self, attention_dim, **kwargs):
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim
        super(HierarchicalAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = self.add_weight(name='W', shape=(input_shape[-1], self.attention_dim), initializer='normal')
        self.b = self.add_weight(name='b', shape=(self.attention_dim,), initializer='normal')
        self.u = self.add_weight(name='u', shape=(self.attention_dim, 1), initializer='normal')
        super(HierarchicalAttention, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        return mask

    def call(self, x, mask=None):
        uit = K.bias_add(K.dot(x, self.W), self.b)
        uit = K.tanh(uit)
        ait = K.dot(uit, self.u)
        ait = K.squeeze(ait, -1)
        ait = K.exp(ait)
        if mask is not None:
            ait *= K.cast(mask, K.floatx())
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        weighted_input = x * K.expand_dims(ait)
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])
# Model architecture
# Word-level encoder: embed each sentence and apply a BiLSTM + attention.
sentence_input = Input(shape=(MAX_SENTENCE_LENGTH,), dtype='int32')
embedded_sequences = Embedding(MAX_WORDS, 100, input_length=MAX_SENTENCE_LENGTH)(sentence_input)
lstm_word = Bidirectional(LSTM(50, return_sequences=True))(embedded_sequences)
word_attention = HierarchicalAttention(100)(lstm_word)
sentence_encoder = Model(sentence_input, word_attention)

# Sentence-level encoder: apply the word-level encoder to every sentence, then
# a BiLSTM + attention over the resulting sentence vectors.
review_input = Input(shape=(MAX_SENTENCES, MAX_SENTENCE_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentence_encoder)(review_input)
lstm_sentence = Bidirectional(LSTM(50, return_sequences=True))(review_encoder)
sentence_attention = HierarchicalAttention(100)(lstm_sentence)
output = Dense(5, activation='softmax')(sentence_attention)  # 5 BBC News categories

model = Model(review_input, output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
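
# Optional: print a summary of the two-level architecture as a quick sanity
# check on layer shapes (not required for training).
model.summary()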
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train, y_train, validation_split=0.1, epochs=10, batch_size=64, callbacks=[early_stopping])
# Evaluate on the held-out test set
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)
print('Test accuracy:', accuracy_score(y_test_classes, y_pred_classes))
print(classification_report(y_test_classes, y_pred_classes))
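
# --- Usage sketch (illustrative addition, not part of the original script) ---
# A minimal example of classifying a single unseen article with the trained
# tokenizer and model; `new_article` is a hypothetical input string.
categories = pd.get_dummies(df['category']).columns
new_article = "The central bank raised interest rates again this quarter."
new_sentences = sent_tokenize(new_article)[:MAX_SENTENCES]
new_seqs = tokenizer.texts_to_sequences([preprocess_text(s) for s in new_sentences])
new_seqs = pad_sequences(new_seqs, maxlen=MAX_SENTENCE_LENGTH, padding='post', truncating='post')
new_doc = np.zeros((1, MAX_SENTENCES, MAX_SENTENCE_LENGTH), dtype='int32')
new_doc[0, :len(new_seqs)] = new_seqs
probs = model.predict(new_doc)[0]
print('Predicted category:', categories[np.argmax(probs)])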