문제 상황
from pycocoevalcap.cider.cider import Cider
import matplotlib.pyplot as plt
import numpy as np
import nltk
import re
import os
import torch
from nltk.tokenize import word_tokenize
# Download the NLTK tokenizer data (the 'punkt' resource) used by word_tokenize
nltk.download('punkt')
# 캡션 전처리 함수
def preprocess_caption(caption):
    """Normalize a caption string for scoring.

    The caption is lowercased, every character that is neither a word
    character nor whitespace is removed, and the result is tokenized with
    NLTK's ``word_tokenize``.

    Args:
        caption: Raw caption text.

    Returns:
        The tokens joined back into a single space-separated string.

    Raises:
        LookupError: Propagated from ``word_tokenize`` when the NLTK
            tokenizer resource it looks up is not installed.
    """
    # Lowercase first, then strip punctuation and other special characters.
    cleaned = re.sub(r'[^\w\s]', '', caption.lower())
    # Tokenize and glue the tokens back together with single spaces.
    return ' '.join(word_tokenize(cleaned))
# CIDEr 평가 함수
def evaluate_and_visualize_with_cider(model, test_loader, word_index, device, image_dir, captions_dict, num_display=5):
    """
    Compute CIDEr scores on the test set and visualize the results.

    For every test image a caption is generated with ``generate_caption``,
    preprocessed with ``preprocess_caption`` and scored against the
    preprocessed reference captions using pycocoevalcap's ``Cider``. A
    histogram of the per-image scores is shown and a few example generations
    are printed.

    Args:
        model: Caption generation model.
        test_loader: Test data loader; each batch is unpacked as
            ``(features, captions, image_names)``.
        word_index: Word-to-index mapping, passed through to
            ``generate_caption``.
        device: PyTorch device (CPU/GPU) the features are moved to.
        image_dir: Image directory, used to build the example image paths.
        captions_dict: Mapping from image name to its list of reference
            captions.
        num_display: Number of example generations to print.

    Returns:
        Tuple ``(avg_cider_score, cider_scores)`` exactly as returned by
        ``Cider.compute_score``: the overall score and the per-image scores.

    Raises:
        ValueError: If no test image has at least one reference caption, so
            there is nothing to score.
    """
    model.eval()

    gts = {}  # image name -> list of preprocessed reference captions
    res = {}  # image name -> single-element list with the generated caption
    test_examples = []
    skipped = 0

    with torch.no_grad():
        for features, _captions, image_names_batch in test_loader:
            features = features.to(device)

            for i, image_name in enumerate(image_names_batch):
                # Fetch the reference captions first: pycocoevalcap's Cider
                # expects at least one reference per image, so an image that
                # is missing from captions_dict cannot be scored.
                references = captions_dict.get(image_name, [])
                if not references:
                    skipped += 1
                    continue

                # Generate the caption for this image
                generated_caption = generate_caption(model, features[i], word_index)

                # Preprocess candidate and references the same way
                generated_caption_proc = preprocess_caption(generated_caption)
                references_proc = [preprocess_caption(ref) for ref in references]

                gts[image_name] = references_proc
                # The candidate must be wrapped in a list for compute_score
                res[image_name] = [generated_caption_proc]

                # Keep only the first num_display examples for printing
                if len(test_examples) < num_display:
                    test_examples.append({
                        "image_path": os.path.join(image_dir, image_name),
                        "generated": generated_caption,
                        "references": references,
                    })

    if skipped:
        print(f"Skipped {skipped} image(s) without reference captions.")
    if not gts:
        raise ValueError(
            "No test images with reference captions were found; "
            "cannot compute CIDEr."
        )

    # Compute the CIDEr scores
    print("Calculating CIDEr scores...")
    cider_scorer = Cider()
    cider_score, cider_scores = cider_scorer.compute_score(gts, res)
    avg_cider_score = cider_score  # overall (average) CIDEr score

    print("\nCIDEr Metric Evaluation:")
    print(f"Average CIDEr Score: {avg_cider_score:.4f}")

    # Plot the distribution of per-image CIDEr scores
    plt.figure(figsize=(10, 5))
    plt.hist(cider_scores, bins=50, alpha=0.7)
    plt.title("Distribution of CIDEr Scores on Test Set")
    plt.xlabel("CIDEr Score")
    plt.ylabel("Count")
    plt.show()

    # Print the example generations
    print("\nExample Generations:")
    for idx, example in enumerate(test_examples, start=1):
        print(f"\nExample {idx}")
        print(f"Image Path: {example['image_path']}")
        print(f"Generated Caption: {example['generated']}")
        print(f"References: {example['references']}")

    return avg_cider_score, cider_scores
# Run the CIDEr evaluation.
# model, test_loader, word_index, device, image_dir and captions_dict are not
# defined in this snippet; they are expected to exist already in the session.
print("Starting CIDEr evaluation...")
avg_cider_score, cider_scores = evaluate_and_visualize_with_cider(
model, test_loader, word_index, device, image_dir, captions_dict
)
# Report the overall score and how many per-image scores were returned
print(f"Average CIDEr Score: {avg_cider_score:.4f}")
print(f"Total samples used for CIDEr calculation: {len(cider_scores)}")
위와 같이 nltk로 캡션을 토크나이즈한 뒤 CIDEr score를 계산하려 했는데,
이미 `nltk.download('punkt')`로 다운로드를 했음에도 아래와 같은 LookupError가 발생했다.
---------------------------------------------------------------------------
LookupError Traceback (most recent call last)
Cell In[65], line 115
113 # CIDEr 평가 실행
114 print("Starting CIDEr evaluation...")
--> 115 avg_cider_score, cider_scores = evaluate_and_visualize_with_cider(
116 model, test_loader, word_index, device, image_dir, captions_dict
117 )
119 print(f"Average CIDEr Score: {avg_cider_score:.4f}")
120 print(f"Total samples used for CIDEr calculation: {len(cider_scores)}")
Cell In[65], line 54
52 generated_caption = generate_caption(model, features[i], word_index)
53 # 전처리된 캡션
---> 54 generated_caption_proc = preprocess_caption(generated_caption)
56 # 참조 캡션 가져오기 및 전처리
57 references = captions_dict.get(image_name, [])
Cell In[65], line 20
18 caption = re.sub(r'[^\w\s]', '', caption)
19 # 토크나이즈
---> 20 tokens = word_tokenize(caption)
21 # 문자열로 다시 결합
22 return ' '.join(tokens)
File ~/.conda/envs/DL/lib/python3.10/site-packages/nltk/tokenize/__init__.py:142, in word_tokenize(text, language, preserve_line)
127 def word_tokenize(text, language="english", preserve_line=False):
128 """
129 Return a tokenized copy of *text*,
130 using NLTK's recommended word tokenizer
(...)
140 :type preserve_line: bool
141 """
--> 142 sentences = [text] if preserve_line else sent_tokenize(text, language)
143 return [
144 token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
145 ]
File ~/.conda/envs/DL/lib/python3.10/site-packages/nltk/tokenize/__init__.py:119, in sent_tokenize(text, language)
109 def sent_tokenize(text, language="english"):
110 """
111 Return a sentence-tokenized copy of *text*,
112 using NLTK's recommended sentence tokenizer
(...)
117 :param language: the model name in the Punkt corpus
...
- '/usr/local/share/nltk_data'
- '/usr/lib/nltk_data'
- '/usr/local/lib/nltk_data'
**********************************************************************
해결
우선 아래와 같은 코드로 tokenizer가 동작하지 않는 이유를 디버깅했다.
import nltk
from nltk.tokenize import word_tokenize
# Tokenize a throwaway sentence; a LookupError means NLTK could not find the
# tokenizer resource it needs.
try:
    word_tokenize("Test sentence for debugging.")
except LookupError as e:
    print("Error:", e)
    # Show the directories NLTK searches, to tell a missing download apart
    # from a data-path problem.
    print("NLTK data path:", nltk.data.path)
else:
    print("Tokenization successful!")
Error:
**********************************************************************
Resource punkt_tab not found.
Please use the NLTK Downloader to obtain the resource:
>>> import nltk
>>> nltk.download('punkt_tab')
For more information see: https://www.nltk.org/data.html
Attempted to load tokenizers/punkt_tab/english/
Searched in:
- '/home/gpu_04/nltk_data'
- '/home/gpu_04/.conda/envs/DL/nltk_data'
- '/home/gpu_04/.conda/envs/DL/share/nltk_data'
- '/home/gpu_04/.conda/envs/DL/lib/nltk_data'
- '/usr/share/nltk_data'
- '/usr/local/share/nltk_data'
- '/usr/lib/nltk_data'
- '/usr/local/lib/nltk_data'
- '/home/gpu_04/.conda/envs/DL/nltk_data'
**********************************************************************
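출력된 안내대로 `nltk.download('punkt_tab')`를 실행해 리소스를 설치하니 해결되었다. 에러 메시지에서 볼 수 있듯이, 설치된 NLTK 버전의 word_tokenize는 'punkt'가 아니라 'punkt_tab' 리소스를 찾기 때문에 'punkt'만 다운로드해서는 해결되지 않는다.
만약 리소스를 설치했는데도 같은 에러가 난다면 path 문제일 수 있다. 이 경우 에러 메시지의 'Searched in' 목록에 설치 경로가 포함되어 있는지 확인하고, 없다면 `nltk.data.path.append('<nltk_data 경로>')`처럼 path를 추가해주면 될 것 같다.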