[nltk] nltk tokenizer 사용 중 nltk LookupError 해결 (nltk.tokenize.word_tokenize)

문제 상황

 from pycocoevalcap.cider.cider import Cider
import matplotlib.pyplot as plt
import numpy as np
import nltk
import re
import os
import torch
from nltk.tokenize import word_tokenize
 
# Download the NLTK tokenizer data.
# NOTE: the error output later in this post shows word_tokenize looking up
# 'punkt_tab', so in that environment this download alone is not sufficient.
nltk.download('punkt')
 
# 캡션 전처리 함수
def preprocess_caption(caption):
    """Normalize a caption string for scoring.

    The caption is lowercased, every character that is neither a word
    character nor whitespace is removed, and the result is tokenized with
    NLTK's ``word_tokenize`` and re-joined with single spaces.

    Args:
        caption: Raw caption string.

    Returns:
        The normalized caption as one space-separated string.
    """
    # Lowercase and strip punctuation in a single step.
    caption = re.sub(r'[^\w\s]', '', caption.lower())
    # Tokenize, then glue the tokens back together with single spaces.
    tokens = word_tokenize(caption)
    return ' '.join(tokens)
 
# CIDEr 평가 함수
def evaluate_and_visualize_with_cider(model, test_loader, word_index, device, image_dir, captions_dict, num_display=5):
    """
    Generate captions for the test set, score them with CIDEr and plot the scores.

    For every image yielded by ``test_loader`` a caption is produced with
    ``generate_caption``, normalized with ``preprocess_caption`` and paired
    with the normalized reference captions from ``captions_dict``. The pairs
    are scored with ``Cider().compute_score``; the per-image scores are shown
    as a histogram and up to ``num_display`` examples are printed.

    Args:
        model: Caption generation model; it is switched to eval mode here.
        test_loader: Iterable yielding ``(features, captions, image_names)``
            batches. Only ``features`` and ``image_names`` are used.
        word_index: Word-to-index mapping, forwarded to ``generate_caption``.
        device: PyTorch device the features are moved to.
        image_dir: Directory joined with each image name to build the path
            printed for the examples.
        captions_dict: Mapping from image name to its list of reference captions.
        num_display: Maximum number of examples to print.

    Returns:
        The tuple ``(avg_cider_score, cider_scores)`` as returned by
        ``Cider.compute_score``.

    Raises:
        ValueError: If no image from ``test_loader`` has reference captions.
    """
    model.eval()
    gts = {}  # image name -> list of preprocessed reference captions
    res = {}  # image name -> single-element list with the preprocessed candidate
    test_examples = []
    skipped = []

    with torch.no_grad():
        for features, _captions, image_names_batch in test_loader:
            features = features.to(device)

            for i in range(features.size(0)):  # one iteration per sample in the batch
                image_name = image_names_batch[i]

                # The scorer cannot handle an image without references
                # (pycocoevalcap's Cider.compute_score asserts at least one),
                # so such images are skipped before any caption is generated.
                references = captions_dict.get(image_name, [])
                if not references:
                    skipped.append(image_name)
                    continue

                # Generate the caption and normalize it.
                generated_caption = generate_caption(model, features[i], word_index)
                generated_caption_proc = preprocess_caption(generated_caption)

                # Normalize the reference captions the same way.
                references_proc = [preprocess_caption(ref) for ref in references]

                # A repeated image name overwrites its earlier entry.
                gts[image_name] = references_proc
                res[image_name] = [generated_caption_proc]

                # Keep the raw (unprocessed) texts of the first few examples for display.
                if len(test_examples) < num_display:
                    test_examples.append({
                        "image_path": os.path.join(image_dir, image_name),
                        "generated": generated_caption,
                        "references": references
                    })

    if skipped:
        print(f"Warning: skipped {len(skipped)} image(s) without reference captions.")
    if not gts:
        raise ValueError("No images with reference captions were found; cannot compute CIDEr.")

    # Compute the corpus-level score and the per-image scores.
    print("Calculating CIDEr scores...")
    cider_scorer = Cider()
    avg_cider_score, cider_scores = cider_scorer.compute_score(gts, res)

    print("\nCIDEr Metric Evaluation:")
    print(f"Average CIDEr Score: {avg_cider_score:.4f}")

    # Histogram of the per-image CIDEr scores.
    plt.figure(figsize=(10, 5))
    plt.hist(cider_scores, bins=50, alpha=0.7)
    plt.title("Distribution of CIDEr Scores on Test Set")
    plt.xlabel("CIDEr Score")
    plt.ylabel("Count")
    plt.show()

    # Print the stored examples.
    print("\nExample Generations:")
    for idx, example in enumerate(test_examples):
        print(f"\nExample {idx + 1}")
        print(f"Image Path: {example['image_path']}")
        print(f"Generated Caption: {example['generated']}")
        print(f"References: {example['references']}")

    return avg_cider_score, cider_scores
 
# Run the CIDEr evaluation.
# NOTE(review): model, test_loader, word_index, device, image_dir and
# captions_dict are not defined in this snippet — presumably they come from
# earlier notebook cells; confirm before running standalone.
print("Starting CIDEr evaluation...")
avg_cider_score, cider_scores = evaluate_and_visualize_with_cider(
    model, test_loader, word_index, device, image_dir, captions_dict
)

# Report the average score and how many per-image scores were returned.
print(f"Average CIDEr Score: {avg_cider_score:.4f}")
print(f"Total samples used for CIDEr calculation: {len(cider_scores)}")

위와 같이 nltk로 캡션을 토크나이즈한 뒤 pycocoevalcap의 Cider로 CIDEr score를 계산하려 했는데,

이미 nltk.download('punkt') 로 다운로드를 했음에도, 아래와 같은 에러가 발생함.

 ---------------------------------------------------------------------------
LookupError                               Traceback (most recent call last)
Cell In[65], line 115
    113 # CIDEr 평가 실행
    114 print("Starting CIDEr evaluation...")
--> 115 avg_cider_score, cider_scores = evaluate_and_visualize_with_cider(
    116     model, test_loader, word_index, device, image_dir, captions_dict
    117 )
    119 print(f"Average CIDEr Score: {avg_cider_score:.4f}")
    120 print(f"Total samples used for CIDEr calculation: {len(cider_scores)}")
 
Cell In[65], line 54
     52 generated_caption = generate_caption(model, features[i], word_index)
     53 # 전처리된 캡션
---> 54 generated_caption_proc = preprocess_caption(generated_caption)
     56 # 참조 캡션 가져오기 및 전처리
     57 references = captions_dict.get(image_name, [])
 
Cell In[65], line 20
     18 caption = re.sub(r'[^\w\s]', '', caption)
     19 # 토크나이즈
---> 20 tokens = word_tokenize(caption)
     21 # 문자열로 다시 결합
     22 return ' '.join(tokens)
 
File ~/.conda/envs/DL/lib/python3.10/site-packages/nltk/tokenize/__init__.py:142, in word_tokenize(text, language, preserve_line)
    127 def word_tokenize(text, language="english", preserve_line=False):
    128     """
    129     Return a tokenized copy of *text*,
    130     using NLTK's recommended word tokenizer
   (...)
    140     :type preserve_line: bool
    141     """
--> 142     sentences = [text] if preserve_line else sent_tokenize(text, language)
    143     return [
    144         token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
    145     ]
 
File ~/.conda/envs/DL/lib/python3.10/site-packages/nltk/tokenize/__init__.py:119, in sent_tokenize(text, language)
    109 def sent_tokenize(text, language="english"):
    110     """
    111     Return a sentence-tokenized copy of *text*,
    112     using NLTK's recommended sentence tokenizer
   (...)
    117     :param language: the model name in the Punkt corpus
...
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************

해결

우선 아래와 같은 코드로 tokenizer가 안되는 이유를 디버깅.

 import nltk
from nltk.tokenize import word_tokenize
 
# Minimal reproduction: tokenize one sentence and, on failure, show the full
# error message together with the directories NLTK searches for its data.
try:
    word_tokenize("Test sentence for debugging.")
except LookupError as err:
    print("Error:", err)
    print("NLTK data path:", nltk.data.path)
else:
    print("Tokenization successful!")

 Error: 
**********************************************************************
  Resource punkt_tab not found.
  Please use the NLTK Downloader to obtain the resource:
 
  >>> import nltk
  >>> nltk.download('punkt_tab')
  
  For more information see: https://www.nltk.org/data.html
 
  Attempted to load tokenizers/punkt_tab/english/
 
  Searched in:
    - '/home/gpu_04/nltk_data'
    - '/home/gpu_04/.conda/envs/DL/nltk_data'
    - '/home/gpu_04/.conda/envs/DL/share/nltk_data'
    - '/home/gpu_04/.conda/envs/DL/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/home/gpu_04/.conda/envs/DL/nltk_data'
**********************************************************************

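에러 메시지의 안내대로 nltk.download('punkt_tab') 으로 리소스를 설치해주니 해결. 에러 메시지에서 보이듯 이 환경의 NLTK는 word_tokenize에서 기존 'punkt'가 아니라 'punkt_tab' 리소스(tokenizers/punkt_tab/english/)를 찾기 때문에, nltk.download('punkt') 만으로는 해결되지 않았던 것이다.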

만약 리소스는 이미 받아져 있는데 path 문제로 찾지 못하는 경우라면, nltk.data.path.append('<nltk_data 디렉토리 경로>') 처럼 위 'Searched in' 목록에 해당 경로를 추가해주면 될 것 같다.

문제 상황
해결

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`

[nltk] nltk tokenizer 사용 중 nltk LookupError 해결 (nltk.tokenize.word_tokenize)

문제 상황

해결

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역

	from pycocoevalcap.cider.cider import Cider
	import matplotlib.pyplot as plt
	import numpy as np
	import nltk
	import re
	import os
	import torch
	from nltk.tokenize import word_tokenize

	# Download the NLTK tokenizer data.
	# NOTE: the error output later in this post shows word_tokenize looking up
	# 'punkt_tab', so in that environment this download alone is not sufficient.
	nltk.download('punkt')

	# 캡션 전처리 함수
	def preprocess_caption(caption):
	    """Normalize a caption string for scoring.

	    The caption is lowercased, every character that is neither a word
	    character nor whitespace is removed, and the result is tokenized with
	    NLTK's ``word_tokenize`` and re-joined with single spaces.

	    Args:
	        caption: Raw caption string.

	    Returns:
	        The normalized caption as one space-separated string.
	    """
	    # Lowercase the text.
	    caption = caption.lower()
	    # Remove everything that is not a word character or whitespace.
	    caption = re.sub(r'[^\w\s]', '', caption)
	    # Tokenize with NLTK.
	    tokens = word_tokenize(caption)
	    # Join the tokens back into one string.
	    return ' '.join(tokens)

	# CIDEr 평가 함수
	def evaluate_and_visualize_with_cider(model, test_loader, word_index, device, image_dir, captions_dict, num_display=5):
	    """
	    Generate captions for the test set, score them with CIDEr and plot the scores.

	    For every image yielded by ``test_loader`` a caption is produced with
	    ``generate_caption``, normalized with ``preprocess_caption`` and paired
	    with the normalized reference captions from ``captions_dict``. The pairs
	    are scored with ``Cider().compute_score``; the per-image scores are shown
	    as a histogram and up to ``num_display`` examples are printed.

	    Args:
	        model: Caption generation model; it is switched to eval mode here.
	        test_loader: Iterable yielding ``(features, captions, image_names)``
	            batches. Only ``features`` and ``image_names`` are used.
	        word_index: Word-to-index mapping, forwarded to ``generate_caption``.
	        device: PyTorch device the features are moved to.
	        image_dir: Directory joined with each image name to build the path
	            printed for the examples.
	        captions_dict: Mapping from image name to its list of reference captions.
	        num_display: Maximum number of examples to print.

	    Returns:
	        The tuple ``(avg_cider_score, cider_scores)`` as returned by
	        ``Cider.compute_score``.

	    Raises:
	        ValueError: If no image from ``test_loader`` has reference captions.
	    """
	    model.eval()
	    gts = {}  # image name -> list of preprocessed reference captions
	    res = {}  # image name -> single-element list with the preprocessed candidate
	    test_examples = []
	    skipped = []

	    with torch.no_grad():
	        for features, _captions, image_names_batch in test_loader:
	            features = features.to(device)

	            for i in range(features.size(0)):  # one iteration per sample in the batch
	                image_name = image_names_batch[i]

	                # The scorer cannot handle an image without references
	                # (pycocoevalcap's Cider.compute_score asserts at least one),
	                # so such images are skipped before any caption is generated.
	                references = captions_dict.get(image_name, [])
	                if not references:
	                    skipped.append(image_name)
	                    continue

	                # Generate the caption and normalize it.
	                generated_caption = generate_caption(model, features[i], word_index)
	                generated_caption_proc = preprocess_caption(generated_caption)

	                # Normalize the reference captions the same way.
	                references_proc = [preprocess_caption(ref) for ref in references]

	                # A repeated image name overwrites its earlier entry.
	                gts[image_name] = references_proc
	                res[image_name] = [generated_caption_proc]

	                # Keep the raw (unprocessed) texts of the first few examples for display.
	                if len(test_examples) < num_display:
	                    test_examples.append({
	                        "image_path": os.path.join(image_dir, image_name),
	                        "generated": generated_caption,
	                        "references": references
	                    })

	    if skipped:
	        print(f"Warning: skipped {len(skipped)} image(s) without reference captions.")
	    if not gts:
	        raise ValueError("No images with reference captions were found; cannot compute CIDEr.")

	    # Compute the corpus-level score and the per-image scores.
	    print("Calculating CIDEr scores...")
	    cider_scorer = Cider()
	    avg_cider_score, cider_scores = cider_scorer.compute_score(gts, res)

	    print("\nCIDEr Metric Evaluation:")
	    print(f"Average CIDEr Score: {avg_cider_score:.4f}")

	    # Histogram of the per-image CIDEr scores.
	    plt.figure(figsize=(10, 5))
	    plt.hist(cider_scores, bins=50, alpha=0.7)
	    plt.title("Distribution of CIDEr Scores on Test Set")
	    plt.xlabel("CIDEr Score")
	    plt.ylabel("Count")
	    plt.show()

	    # Print the stored examples.
	    print("\nExample Generations:")
	    for idx, example in enumerate(test_examples):
	        print(f"\nExample {idx + 1}")
	        print(f"Image Path: {example['image_path']}")
	        print(f"Generated Caption: {example['generated']}")
	        print(f"References: {example['references']}")

	    return avg_cider_score, cider_scores

	# Run the CIDEr evaluation.
	# NOTE(review): model, test_loader, word_index, device, image_dir and
	# captions_dict are not defined in this snippet — presumably they come from
	# earlier notebook cells; confirm before running standalone.
	print("Starting CIDEr evaluation...")
	avg_cider_score, cider_scores = evaluate_and_visualize_with_cider(
	model, test_loader, word_index, device, image_dir, captions_dict
	)

	# Report the average score and how many per-image scores were returned.
	print(f"Average CIDEr Score: {avg_cider_score:.4f}")
	print(f"Total samples used for CIDEr calculation: {len(cider_scores)}")

	---------------------------------------------------------------------------
	LookupError Traceback (most recent call last)
	Cell In[65], line 115
	113 # CIDEr 평가 실행
	114 print("Starting CIDEr evaluation...")
	--> 115 avg_cider_score, cider_scores = evaluate_and_visualize_with_cider(
	116 model, test_loader, word_index, device, image_dir, captions_dict
	117 )
	119 print(f"Average CIDEr Score: {avg_cider_score:.4f}")
	120 print(f"Total samples used for CIDEr calculation: {len(cider_scores)}")

	Cell In[65], line 54
	52 generated_caption = generate_caption(model, features[i], word_index)
	53 # 전처리된 캡션
	---> 54 generated_caption_proc = preprocess_caption(generated_caption)
	56 # 참조 캡션 가져오기 및 전처리
	57 references = captions_dict.get(image_name, [])

	Cell In[65], line 20
	18 caption = re.sub(r'[^\w\s]', '', caption)
	19 # 토크나이즈
	---> 20 tokens = word_tokenize(caption)
	21 # 문자열로 다시 결합
	22 return ' '.join(tokens)

	File ~/.conda/envs/DL/lib/python3.10/site-packages/nltk/tokenize/__init__.py:142, in word_tokenize(text, language, preserve_line)
	127 def word_tokenize(text, language="english", preserve_line=False):
	128 """
	129 Return a tokenized copy of text,
	130 using NLTK's recommended word tokenizer
	(...)
	140 :type preserve_line: bool
	141 """
	--> 142 sentences = [text] if preserve_line else sent_tokenize(text, language)
	143 return [
	144 token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
	145 ]

	File ~/.conda/envs/DL/lib/python3.10/site-packages/nltk/tokenize/__init__.py:119, in sent_tokenize(text, language)
	109 def sent_tokenize(text, language="english"):
	110 """
	111 Return a sentence-tokenized copy of text,
	112 using NLTK's recommended sentence tokenizer
	(...)
	117 :param language: the model name in the Punkt corpus
	...
	- '/usr/local/share/nltk_data'
	- '/usr/lib/nltk_data'
	- '/usr/local/lib/nltk_data'
	**********************************************************************

	import nltk
	from nltk.tokenize import word_tokenize

	# Minimal reproduction: tokenize one sentence and, on failure, show the full
	# error message together with the directories NLTK searches for its data.
	try:
	    word_tokenize("Test sentence for debugging.")
	    print("Tokenization successful!")
	except LookupError as e:
	    print("Error:", e)
	    print("NLTK data path:", nltk.data.path)

	Error:
	**********************************************************************
	Resource punkt_tab not found.
	Please use the NLTK Downloader to obtain the resource:

	>>> import nltk
	>>> nltk.download('punkt_tab')

	For more information see: https://www.nltk.org/data.html

	Attempted to load tokenizers/punkt_tab/english/

	Searched in:
	- '/home/gpu_04/nltk_data'
	- '/home/gpu_04/.conda/envs/DL/nltk_data'
	- '/home/gpu_04/.conda/envs/DL/share/nltk_data'
	- '/home/gpu_04/.conda/envs/DL/lib/nltk_data'
	- '/usr/share/nltk_data'
	- '/usr/local/share/nltk_data'
	- '/usr/lib/nltk_data'
	- '/usr/local/lib/nltk_data'
	- '/home/gpu_04/.conda/envs/DL/nltk_data'
	**********************************************************************