import pandas as pd
import numpy as np
import urllib.request
import os
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
from transformers import BertTokenizer, TFBertModel

# Download the Naver movie review (NSMC) training and test files used for training.
# A file that already exists locally is not fetched again, so re-running the
# script does not repeat the download; delete the file to force a fresh copy.
if not os.path.exists("ratings_train.txt"):
  urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt",
                             filename="ratings_train.txt")
if not os.path.exists("ratings_test.txt"):
  urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt",
                             filename="ratings_test.txt")

# Both files are read as tab-separated tables.
train_data = pd.read_table('ratings_train.txt')
test_data = pd.read_table('ratings_test.txt')
print('훈련용 리뷰: ',len(train_data))
print('테스트용 리뷰: ',len(test_data))

훈련용 리뷰: 150000 테스트용 리뷰: 50000

# Preview the first rows of the training DataFrame.
train_data.head()

id	document	label
9976970	아 더빙.. 진짜 짜증나네요 목소리	0
3819312	흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나	1
10265843	너무재밓었다그래서보는것을추천한다	0
9045019	교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정	0
6483659	사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...	1

# Print a summary of the training DataFrame (columns, non-null counts, dtypes).
train_data.info()
"""
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        150000 non-null  int64 
 1   document  149995 non-null  object
 2   label     150000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.4+ MB
"""
# Print the same summary for the test DataFrame.
test_data.info()
"""
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        50000 non-null  int64 
 1   document  49997 non-null  object
 2   label     50000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.1+ MB
"""

결측치 존재

train_data['document'].value_counts().head() # Many reviews are duplicated; they must be removed.

document count
	
굿		181
good		92
최고		85
쓰레기		79
별로		66

train_data.loc[train_data['document'].isna()] # Missing values exist; these must be removed as well.
	id	document	label
25857	2172111	NaN	1
55737	6369843	NaN	1
110014	1034280	NaN	0
126782	5942978	NaN	0
140721	1034283	NaN	0

# Drop rows whose review text is duplicated, then rows with any missing value.
train_data = train_data.drop_duplicates(subset=['document']).dropna(how='any')

print('훈련용 리뷰 개수: ',len(train_data))

훈련용 리뷰 개수: 146182

# Drop test rows that contain any missing value.
test_data = test_data.dropna(how='any')
print('테스트 데이터의 리뷰수: ',len(test_data))

테스트 데이터의 리뷰수: 49997

BERT의 입력

BERT의 입력을 세 가지 준비해야 한다.
1. 정수 인코딩
2. 세그먼트 인코딩 (문장 구분)
3. 어텐션 마스크 (단어 토큰, 패딩 토큰 구분)

tokenizer = BertTokenizer.from_pretrained('klue/bert-base') # tokenizer optimized for Korean

정수 인코딩 + 패딩

# Fixed sequence length: the sample below is padded out to this many token ids.
max_seq_len =128

# Encode one sample sentence into token ids, padded up to max_seq_len.
encoded_result = tokenizer.encode('전율을 일으키는 영화. 다시 보고 싶은 영화',
                                  padding='max_length',
                                  max_length=max_seq_len,
                                  )
print(encoded_result)
print(len(encoded_result))

[2, 1537, 2534, 2069, 6572, 2259, 3771, 18, 3690, 4530, 1335, 2073, 3771, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
128

세그먼트 인코딩

# The input is always a single text, so the segment ids are filled with 0.
print([0] * max_seq_len)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

어텐션 마스크 인코딩

# Length of the unpadded encoding of the sample sentence, i.e. its real tokens.
valid_num = len(tokenizer.encode('전율을 일으키는 영화. 다시 보고 싶은 영화'))
# 1 marks a real token position, 0 marks a padding position.
print([1] * valid_num + [0] * (max_seq_len - valid_num))

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

입력된 전체 데이터에 대해서 이 과정을 진행하는 함수

def convert_examples_to_features(examples, labels, max_seq_len, tokenizer):
  """Encode texts into the three fixed-length BERT inputs plus a label array.

  Args:
    examples: Sized iterable of text strings (``len()`` is used for the
      progress bar total).
    labels: Iterable of integer labels aligned with ``examples``.
    max_seq_len: Length every sequence is truncated / padded to.
    tokenizer: Tokenizer exposing ``encode`` and ``pad_token_id``.

  Returns:
    ``((input_ids, attention_masks, token_type_ids), data_labels)``. The three
    feature arrays are integer arrays of shape ``(num_examples, max_seq_len)``;
    ``data_labels`` is an int32 array of length ``num_examples``.
  """
  # input_ids: integer encoding of the sentence for the word embedding
  # attention_masks: attention mask encoding
  # token_type_ids: segment encoding
  input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []

  for example, label in tqdm(zip(examples, labels), total=len(examples)):
    # Encode WITHOUT padding so the number of real tokens is known exactly.
    # Counting pad ids in an already padded sequence would also count any pad
    # id that appears inside the text itself and mask the wrong positions.
    token_ids = tokenizer.encode(example, max_length=max_seq_len, truncation=True)
    valid_count = len(token_ids)
    padding_count = max_seq_len - valid_count

    # Pad on the right up to max_seq_len.
    input_id = token_ids + [tokenizer.pad_token_id] * padding_count
    attention_mask = [1] * valid_count + [0] * padding_count
    # Every input is a single text, so all segment ids are 0.
    token_type_id = [0] * max_seq_len

    assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
    assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
    assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)

    input_ids.append(input_id)
    attention_masks.append(attention_mask)
    token_type_ids.append(token_type_id)
    data_labels.append(label)

  # TFBertModel expects tf.Tensor or numpy inputs. The reshape is a no-op for
  # non-empty input and keeps a (0, max_seq_len) shape when there are no examples.
  input_ids = np.array(input_ids, dtype=int).reshape(-1, max_seq_len)
  attention_masks = np.array(attention_masks, dtype=int).reshape(-1, max_seq_len)
  token_type_ids = np.array(token_type_ids, dtype=int).reshape(-1, max_seq_len)

  data_labels = np.asarray(data_labels, dtype=np.int32)

  return (input_ids, attention_masks, token_type_ids), data_labels

# Convert the training data into BERT inputs and labels.
train_X, train_y = convert_examples_to_features(train_data['document'], train_data['label'],
                                                max_seq_len=max_seq_len, tokenizer=tokenizer)

100%|██████████| 146182/146182 [00:46<00:00, 3141.39it/s]

# Convert the test data into BERT inputs and labels.
test_X, test_y = convert_examples_to_features(test_data['document'], test_data['label'],
                                              max_seq_len=max_seq_len, tokenizer=tokenizer)

100%|██████████| 49997/49997 [00:14<00:00, 3436.79it/s]

# Inspect the first training sample: take row 0 of each of the three feature arrays.
input_id, attention_mask, token_type_id = (feature[0] for feature in train_X)
label = train_y[0]

print('단어에 대한 정수 인코딩:', input_id)
print('어텐션 마스크:', attention_mask)
print('세그먼트 인코딩:', token_type_id)
print('각 인코딩 의 길이:', len(input_id))
print('정수 인코딩 복원:', tokenizer.decode(input_id))
print('레이블 :',label)

단어에 대한 정수 인코딩: [   2 1376  831 2604   18   18 4229 9801 2075 2203 2182 4243    3    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
어텐션 마스크: [1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
세그먼트 인코딩: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
각 인코딩 의 길이: 128
정수 인코딩 복원: [CLS] 아 더빙.. 진짜 짜증나네요 목소리 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
레이블 : 0

BERT의 출력 이해하기

# Load the pretrained 'klue/bert-base' encoder (from_pt=True: from its PyTorch weights).
model = TFBertModel.from_pretrained('klue/bert-base', from_pt=True)

max_seq_len = 128

# Three int32 Keras inputs of length max_seq_len, created in the order
# input ids, attention mask, token type ids.
input_ids_layer, attention_masks_layer, token_type_ids_layer = (
    tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32) for _ in range(3)
)

# Store BERT's output in a variable named `outputs`.
outputs = model([input_ids_layer, attention_masks_layer, token_type_ids_layer])

outputs[0] 은 (batch size, 128, 768)
문장의 길이 개수만큼의 출력. Many-to-Many 태스크의 경우 outputs[0] 을 사용

outputs[1] 은 (batch size, 768)
[CLS] 토큰 위치의 출력(pooler 출력). Many-to-One 태스크의 경우 outputs[1] 을 사용.
지금과 같은 영화 리뷰 분류 문제는 이에 해당한다.

BERT를 이용한 Many-to-One 모델 만들기

class TFBertForSequenceClassification(tf.keras.Model):
  """Pretrained BERT encoder followed by one sigmoid unit for binary classification.

  ``call`` takes ``(input_ids, attention_mask, token_type_ids)`` and returns the
  classifier output computed from the encoder's second output (``outputs[1]``).
  """

  def __init__(self, model_name):
    """Load the encoder named ``model_name`` and create the classification head."""
    super().__init__()
    self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
    self.classifier = tf.keras.layers.Dense(1,
                                            # Weight init with mean 0 and stddev 0.02. The value must be
                                            # passed as `stddev`: the first positional argument of
                                            # TruncatedNormal is `mean`.
                                            kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
                                            activation='sigmoid',
                                            name='classifier')

  def call(self, inputs):
    """Return the sigmoid score for a batch of encoded inputs."""
    input_ids, attention_mask, token_type_ids = inputs
    outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    cls_token = outputs[1]
    prediction = self.classifier(cls_token)
    return prediction
    
# Build the classifier on top of the 'klue/bert-base' checkpoint.
# Note: this rebinds `model`, which previously held the bare TFBertModel.
model = TFBertForSequenceClassification('klue/bert-base')
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# Binary cross-entropy matches the single sigmoid output of the classifier head.
loss = tf.keras.losses.BinaryCrossentropy()
model.compile(optimizer=optimizer, loss=loss, metrics = ['accuracy'])

# Train for 2 epochs with batch size 64; 20% of the training data is held out for validation.
model.fit(train_X, train_y, epochs=2, batch_size=64, validation_split=0.2)

예측

def sentiment_predict(new_sentence):
  """Print the predicted sentiment of one review sentence.

  Uses the module-level ``tokenizer``, ``model`` and ``max_seq_len``.

  Args:
    new_sentence: Raw review text to classify.
  """
  # Prepare the three input arrays.

  # 1. Integer encoding + padding.
  # Encode WITHOUT padding so the number of real tokens is known exactly.
  # Counting pad ids in a padded sequence would also count any pad id that
  # appears inside the text itself.
  token_ids = tokenizer.encode(new_sentence, max_length=max_seq_len, truncation=True)
  valid_count = len(token_ids)
  padding_count = max_seq_len - valid_count
  input_id = token_ids + [tokenizer.pad_token_id] * padding_count

  # 2. Attention mask: 1 for real tokens, 0 for padding.
  attention_mask = [1] * valid_count + [0] * padding_count

  # 3. Segment encoding: a single text, so all zeros.
  token_type_id = [0] * max_seq_len

  # Convert the inputs to numpy arrays, wrapped in a list to add the batch dimension.
  input_ids = np.array([input_id])
  attention_masks = np.array([attention_mask])
  token_type_ids = np.array([token_type_id])

  encoded_input = [input_ids, attention_masks, token_type_ids]  # input sequence
  score = model.predict(encoded_input)[0][0]  # output for the first batch item, 0.0 ~ 1.0

  if score > 0.5:
    print("{:.2f}% 확률로 긍정 리뷰입니다.\n".format(score * 100))
  else:
    print("{:.2f}% 확률로 부정 리뷰입니다.\n".format((1 - score) * 100))

# Sample review sentences to run through the trained classifier.
input_sentiments = [
    '보던거라 계속보고있는데 전개도 느리고 주인공인 은희는 한두컷 나오면서 소극적인모습에 ',
    "스토리는 확실히 실망이였지만 배우들 연기력이 대박이였다 특히 이제훈 연기 정말 ... 이 배우들로 이렇게밖에 만들지 못한 영화는 아쉽지만 배우들 연기력과 사운드는 정말 빛났던 영화. 기대하고 극장에서 보면 많이 실망했겠지만 평점보고 기대없이 집에서 편하게 보면 괜찮아요. 이제훈님 연기력은 최고인 것 같습니다",
    "남친이 이 영화를 보고 헤어지자고한 영화. 자유롭게 살고 싶다고 한다. 내가 무슨 나비를 잡은 덫마냥 나에겐 다시 보고싶지 않은 영화.",
    "이 영화 존잼입니다 대박",
    '이 영화 개꿀잼 ㅋㅋㅋ',
    '이 영화 핵노잼 ㅠㅠ',
    '감독 뭐하는 놈이냐?',
    '와 개쩐다 정말 세계관 최강자들의 영화다',
]
# Print each sentence, its predicted sentiment, and a separator line.
for sentiment in input_sentiments:
  print(sentiment)
  sentiment_predict(sentiment)
  print('🟦' * 20)

보던거라 계속보고있는데 전개도 느리고 주인공인 은희는 한두컷 나오면서 소극적인모습에 
1/1 [==============================] - 3s 3s/step
97.74% 확률로 부정 리뷰입니다.

🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦
스토리는 확실히 실망이였지만 배우들 연기력이 대박이였다 특히 이제훈 연기 정말 ... 이 배우들로 이렇게밖에 만들지 못한 영화는 아쉽지만 배우들 연기력과 사운드는 정말 빛났던 영화. 기대하고 극장에서 보면 많이 실망했겠지만 평점보고 기대없이 집에서 편하게 보면 괜찮아요. 이제훈님 연기력은 최고인 것 같습니다
1/1 [==============================] - 0s 58ms/step
99.66% 확률로 긍정 리뷰입니다.

🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦
남친이 이 영화를 보고 헤어지자고한 영화. 자유롭게 살고 싶다고 한다. 내가 무슨 나비를 잡은 덫마냥 나에겐 다시 보고싶지 않은 영화.
1/1 [==============================] - 0s 57ms/step
70.67% 확률로 긍정 리뷰입니다.

🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦
이 영화 존잼입니다 대박
1/1 [==============================] - 0s 56ms/step
97.99% 확률로 긍정 리뷰입니다.

🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦
이 영화 개꿀잼 ㅋㅋㅋ
1/1 [==============================] - 0s 56ms/step
98.38% 확률로 긍정 리뷰입니다.

🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦
이 영화 핵노잼 ㅠㅠ
1/1 [==============================] - 0s 58ms/step
98.16% 확률로 부정 리뷰입니다.

🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦
감독 뭐하는 놈이냐?
1/1 [==============================] - 0s 56ms/step
92.58% 확률로 부정 리뷰입니다.

🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦
와 개쩐다 정말 세계관 최강자들의 영화다
1/1 [==============================] - 0s 57ms/step
97.16% 확률로 긍정 리뷰입니다.

🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦🟦

'AI > ML' 카테고리의 다른 글

BERT (2) (0)	2026.05.03
BERT(1) (0)	2026.05.02
BERT - 발전흐름 (0)	2026.05.01
트랜스포머 가계도 (0)	2026.04.30
디코더 (Decoder) 구조 (0)	2026.04.29

phg

KoBERT를 이용한 네이버 영화 리뷰 분류하기

BERT의 입력

정수 인코딩 + 패딩

세그먼트 인코딩

어텐션 마스크 인코딩

입력된 전체 데이터에 대해서 이과정을 진행하는 함수

BERT의 출력 이해하기

BERT를 이용한 Many-to-One 모델 만들기

예측

'AI > ML' 카테고리의 다른 글

티스토리툴바

KoBERT를 이용한 네이버 영화 리뷰 분류하기

BERT의 입력

정수 인코딩 + 패딩

세그먼트 인코딩

어텐션 마스크 인코딩

입력된 전체 데이터에 대해서 이과정을 진행하는 함수

BERT의 출력 이해하기

BERT를 이용한 Many-to-One 모델 만들기

예측

'AI > ML' 카테고리의 다른 글

관련글

티스토리툴바