7-3 ๋ฌธ์ž RNN์œผ๋กœ ์„ฑ์”จ ์ƒ์„ฑํ•˜๊ธฐ ์‹ค์Šต

๋ณธ ์˜ˆ์ œ์—์„œ๋Š” RNN์œผ๋กœ ์„ฑ์”จ๋ฅผ ์ƒ์„ฑํ•˜๋Š” ๊ฐ„๋‹จํ•œ ์‹œํ€€์Šค ์˜ˆ์ธก task๋ฅผ ๋งŒ๋“ญ๋‹ˆ๋‹ค. ๊ฐ ํƒ€์ž„ ์Šคํ…์— ๋Œ€ํ•ด ์„ฑ์”จ์— ํฌํ•จ๋  ์ˆ˜ ์žˆ๋Š” ๋ฌธ์ž ์ง‘ํ•ฉ์— ๋Œ€ํ•œ ํ™•๋ฅ  ๋ถ„ํฌ๋ฅผ ๊ณ„์‚ฐํ•˜์—ฌ, ์˜ˆ์ธก์„ ํ–ฅ์ƒ์‹œํ‚ค๊ฑฐ๋‚˜ ์ƒˆ๋กœ์šด ์„ฑ์”จ๋ฅผ ์ƒ์„ฑํ•ด๋‚ผ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
ย 
์ด๋ฒˆ ์‹ค์Šต ์˜ˆ์ œ์—์„œ๋Š” ์กฐ๊ฑด์ด ์—†๋Š” SurnameGenerationModel๊ณผ, ํŠน์ • ๊ตญ์ ์„ ์ž„๋ฒ ๋”ฉํ•ด RNN์˜ ์ดˆ๊ธฐ ์€๋‹‰ ์ƒํƒœ๋กœ ์‚ฌ์šฉํ•จ์œผ๋กœ์จ ํŽธํ–ฅ์„ฑ์„ ์ค€ ์กฐ๊ฑด ์žˆ๋Š” SurnameGenerationModel์„ ๋งŒ๋“ค์–ด ์ ์šฉํ•ด๋ณด๊ฒ ์Šต๋‹ˆ๋‹ค.
ย 

1. SurnameDataset ํด๋ž˜์Šค

์ด๋ฒˆ์— ํ™œ์šฉํ•  ๋ฐ์ดํ„ฐ์™€ Dataset ํด๋ž˜์Šค๋Š” ์ด์ „์— ์ˆ˜ํ–‰ํ•œ ์‹ค์Šต์ธ
6-2 RNN ์‹ค์Šต : ์„ฑ์”จ ๊ตญ์  ๋ถ„๋ฅ˜ (1)
์—์„œ ํ™œ์šฉํ•œ Dataset ํด๋ž˜์Šค์™€ ๋งค์šฐ ์œ ์‚ฌํ•ฉ๋‹ˆ๋‹ค. ๊ธฐ์กด์˜ Dataset๊ณผ ๋‹ฌ๋ผ์ง„ ์ ์€ __getitem__()์—์„œ vector ์ •๋ณด(surname_vector, nationality_index, vector_length)๊ฐ€ ์•„๋‹Œ ์˜ˆ์ธก ํƒ€๊ฒŸ์— ๋Œ€ํ•œ ์ •์ˆ˜ ์‹œํ€€์Šค, ์ถœ๋ ฅ ์ •์ˆ˜ ์‹œํ€€์Šค(from_vector, to_vector)๋ฅผ ์ถœ๋ ฅํ•œ๋‹ค๋Š” ์ ์ž…๋‹ˆ๋‹ค.
ย 

Dataset ์†Œ์Šค์ฝ”๋“œ

class SurnameDataset(Dataset):
    def __init__(self, surname_df, vectorizer):
        """
        ๋งค๊ฐœ๋ณ€์ˆ˜:
            surname_df (pandas.DataFrame): ๋ฐ์ดํ„ฐ์…‹
            vectorizer (SurnameVectorizer): ๋ฐ์ดํ„ฐ์…‹์—์„œ ๋งŒ๋“  Vectorizer ๊ฐ์ฒด
        """
        self.surname_df = surname_df
        self._vectorizer = vectorizer
        self._max_seq_length = max(map(len, self.surname_df.surname)) + 2

        self.train_df = self.surname_df[self.surname_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.surname_df[self.surname_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.surname_df[self.surname_df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}

        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, surname_csv):
        """๋ฐ์ดํ„ฐ์…‹์„ ๋กœ๋“œํ•˜๊ณ  ์ƒˆ๋กœ์šด Vectorizer๋ฅผ ๋งŒ๋“ญ๋‹ˆ๋‹ค

        ๋งค๊ฐœ๋ณ€์ˆ˜:
            surname_csv (str): ๋ฐ์ดํ„ฐ์…‹์˜ ์œ„์น˜
        ๋ฐ˜ํ™˜๊ฐ’:
            SurnameDataset ๊ฐ์ฒด
        """
        surname_df = pd.read_csv(surname_csv)
        return cls(surname_df, SurnameVectorizer.from_dataframe(surname_df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, surname_csv, vectorizer_filepath):
        """๋ฐ์ดํ„ฐ์…‹๊ณผ ์ƒˆ๋กœ์šด Vectorizer ๊ฐ์ฒด๋ฅผ ๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค.
        ์บ์‹œ๋œ Vectorizer ๊ฐ์ฒด๋ฅผ ์žฌ์‚ฌ์šฉํ•  ๋•Œ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค.

        ๋งค๊ฐœ๋ณ€์ˆ˜:
            surname_csv (str): ๋ฐ์ดํ„ฐ์…‹์˜ ์œ„์น˜
            vectorizer_filepath (str): Vectorizer ๊ฐ์ฒด์˜ ์ €์žฅ ์œ„์น˜
        ๋ฐ˜ํ™˜๊ฐ’:
            SurnameDataset์˜ ์ธ์Šคํ„ด์Šค
        """
        surname_df = pd.read_csv(surname_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(surname_df, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        """ํŒŒ์ผ์—์„œ Vectorizer ๊ฐ์ฒด๋ฅผ ๋กœ๋“œํ•˜๋Š” ์ •์  ๋ฉ”์„œ๋“œ

        ๋งค๊ฐœ๋ณ€์ˆ˜:
            vectorizer_filepath (str): ์ง๋ ฌํ™”๋œ Vectorizer ๊ฐ์ฒด์˜ ์œ„์น˜
        ๋ฐ˜ํ™˜๊ฐ’:
            SurnameVectorizer์˜ ์ธ์Šคํ„ด์Šค
        """
        with open(vectorizer_filepath) as fp:
            return SurnameVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        """Vectorizer ๊ฐ์ฒด๋ฅผ json ํ˜•ํƒœ๋กœ ๋””์Šคํฌ์— ์ €์žฅํ•ฉ๋‹ˆ๋‹ค

        ๋งค๊ฐœ๋ณ€์ˆ˜:
            vectorizer_filepath (str): Vectorizer ๊ฐ์ฒด์˜ ์ €์žฅ ์œ„์น˜
        """
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        """ ๋ฒกํ„ฐ ๋ณ€ํ™˜ ๊ฐ์ฒด๋ฅผ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค """
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """ํŒŒ์ดํ† ์น˜ ๋ฐ์ดํ„ฐ์…‹์˜ ์ฃผ์š” ์ง„์ž… ๋ฉ”์„œ๋“œ

        ๋งค๊ฐœ๋ณ€์ˆ˜:
            index (int): ๋ฐ์ดํ„ฐ ํฌ์ธํŠธ์— ๋Œ€ํ•œ ์ธ๋ฑ์Šค
        ๋ฐ˜ํ™˜๊ฐ’:
            ๋ฐ์ดํ„ฐ ํฌ์ธํŠธ(x_data, y_target, class_index)๋ฅผ ๋‹ด๊ณ  ์žˆ๋Š” ๋”•์…”๋„ˆ๋ฆฌ
        """
        row = self._target_df.iloc[index]

        from_vector, to_vector = \
            self._vectorizer.vectorize(row.surname, self._max_seq_length)

        nationality_index = \
            self._vectorizer.nationality_vocab.lookup_token(row.nationality)

        return {'x_data': from_vector,
                'y_target': to_vector,
                'class_index': nationality_index}

    def get_num_batches(self, batch_size):
        """๋ฐฐ์น˜ ํฌ๊ธฐ๊ฐ€ ์ฃผ์–ด์ง€๋ฉด ๋ฐ์ดํ„ฐ์…‹์œผ๋กœ ๋งŒ๋“ค ์ˆ˜ ์žˆ๋Š” ๋ฐฐ์น˜ ๊ฐœ์ˆ˜๋ฅผ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค

        ๋งค๊ฐœ๋ณ€์ˆ˜:
            batch_size (int)
        ๋ฐ˜ํ™˜๊ฐ’:
            ๋ฐฐ์น˜ ๊ฐœ์ˆ˜
        """
        return len(self) // batch_size
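
์•„๋ž˜๋Š” ์œ„ Dataset ํด๋ž˜์Šค์˜ ์‚ฌ์šฉ ํ๋ฆ„์„ ๋ณด์—ฌ์ฃผ๋Š” ๊ฐ„๋‹จํ•œ ์Šค์ผ€์น˜์ž…๋‹ˆ๋‹ค. CSV ๊ฒฝ๋กœ๋Š” ๋’ค์— ๋‚˜์˜ฌ args ์„ค์ •์˜ ๊ฐ’์„ ๊ทธ๋Œ€๋กœ ์‚ฌ์šฉํ•œ๋‹ค๊ณ  ๊ฐ€์ •ํ–ˆ์Šต๋‹ˆ๋‹ค.

# ๊ฐ€์ •: surnames_with_splits.csv๊ฐ€ ์•„๋ž˜ ๊ฒฝ๋กœ์— ์กด์žฌํ•ฉ๋‹ˆ๋‹ค
dataset = SurnameDataset.load_dataset_and_make_vectorizer(
    "data/surnames/surnames_with_splits.csv")
vectorizer = dataset.get_vectorizer()

sample = dataset[0]           # __getitem__() ํ˜ธ์ถœ
print(sample['x_data'])       # ์ž…๋ ฅ ์ •์ˆ˜ ์‹œํ€€์Šค (from_vector)
print(sample['y_target'])     # ํƒ€๊นƒ ์ •์ˆ˜ ์‹œํ€€์Šค (to_vector)
print(sample['class_index'])  # ๊ตญ์  ์ธ๋ฑ์Šค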
ย 

2. ๋ฒกํ„ฐ ๋ณ€ํ™˜ ํด๋ž˜์Šค

์ด์ „๊ณผ ๋งˆ์ฐฌ๊ฐ€์ง€๋กœ 3๊ฐ€์ง€ ์ฃผ์š” ํด๋ž˜์Šค์ธ Vocabulary, Vectorizer, DataLoader๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค.
  1. SequenceVocabulary์—์„œ๋Š” ๊ฐœ๋ณ„ ๋ฌธ์ž ํ† ํฐ๋“ค์„ ์ •์ˆ˜๋กœ ๋งคํ•‘ํ•˜๋Š” ์ž‘์—…์„ ์ˆ˜ํ–‰ํ•ฉ๋‹ˆ๋‹ค.
  2. SurnameVectorizer์—์„œ๋Š” ์œ„์—์„œ ๋งคํ•‘ํ•œ ์ˆซ์ž๊ฐ’์œผ๋กœ ๋ฒกํ„ฐํ™”๋ฅผ ์ง„ํ–‰ํ•ฉ๋‹ˆ๋‹ค.
  3. DataLoader์—์„œ๋Š” Vectorizer์—์„œ ๋งŒ๋“ค์–ด์ง„ ๋ฒกํ„ฐ๋“ค์„ ๋ฏธ๋‹ˆ๋ฐฐ์น˜๋กœ ๋งŒ๋“ค์–ด์ค๋‹ˆ๋‹ค.
ย 
SequenceVocabulary์™€ DataLoader๋Š” ์•ž์„œ "6-2 RNN ์‹ค์Šต : ์„ฑ์”จ ๊ตญ์  ๋ถ„๋ฅ˜ (1)"์—์„œ ๋‹ค๋ฃฌ ์˜ˆ์ œ์™€ ๋™์ผํ•œ ์†Œ์Šค์ฝ”๋“œ๋ฅผ ํ™œ์šฉํ•˜๋ฏ€๋กœ, ์†Œ์Šค์ฝ”๋“œ๋งŒ ์ฒจ๋ถ€ํ•ด๋‘๊ฒ ์Šต๋‹ˆ๋‹ค.
ย 

SequenceVocabulary ์†Œ์Šค์ฝ”๋“œ

class Vocabulary(object):
    """๋งคํ•‘์„ ์œ„ํ•ด ํ…์ŠคํŠธ๋ฅผ ์ฒ˜๋ฆฌํ•˜๊ณ  ์–ดํœ˜ ์‚ฌ์ „์„ ๋งŒ๋“œ๋Š” ํด๋ž˜์Šค """

    def __init__(self, token_to_idx=None):
        """
        ๋งค๊ฐœ๋ณ€์ˆ˜:
            token_to_idx (dict): ๊ธฐ์กด ํ† ํฐ-์ธ๋ฑ์Šค ๋งคํ•‘ ๋”•์…”๋„ˆ๋ฆฌ
        """
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}

    def to_serializable(self):
        """ ์ง๋ ฌํ™”ํ•  ์ˆ˜ ์žˆ๋Š” ๋”•์…”๋„ˆ๋ฆฌ๋ฅผ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค """
        return {'token_to_idx': self._token_to_idx}

    @classmethod
    def from_serializable(cls, contents):
        """ ์ง๋ ฌํ™”๋œ ๋”•์…”๋„ˆ๋ฆฌ์—์„œ Vocabulary ๊ฐ์ฒด๋ฅผ ๋งŒ๋“ญ๋‹ˆ๋‹ค """
        return cls(**contents)

    def add_token(self, token):
        """ ํ† ํฐ์„ ๊ธฐ๋ฐ˜์œผ๋กœ ๋งคํ•‘ ๋”•์…”๋„ˆ๋ฆฌ๋ฅผ ์—…๋ฐ์ดํŠธํ•ฉ๋‹ˆ๋‹ค

        ๋งค๊ฐœ๋ณ€์ˆ˜:
            token (str): Vocabulary์— ์ถ”๊ฐ€ํ•  ํ† ํฐ
        ๋ฐ˜ํ™˜๊ฐ’:
            index (int): ํ† ํฐ์— ์ƒ์‘ํ•˜๋Š” ์ •์ˆ˜
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def add_many(self, tokens):
        """ํ† ํฐ ๋ฆฌ์ŠคํŠธ๋ฅผ Vocabulary์— ์ถ”๊ฐ€ํ•ฉ๋‹ˆ๋‹ค.

        ๋งค๊ฐœ๋ณ€์ˆ˜:
            tokens (list): ๋ฌธ์ž์—ด ํ† ํฐ ๋ฆฌ์ŠคํŠธ
        ๋ฐ˜ํ™˜๊ฐ’:
            indices (list): ํ† ํฐ ๋ฆฌ์ŠคํŠธ์— ์ƒ์‘๋˜๋Š” ์ธ๋ฑ์Šค ๋ฆฌ์ŠคํŠธ
        """
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        """ํ† ํฐ์— ๋Œ€์‘ํ•˜๋Š” ์ธ๋ฑ์Šค๋ฅผ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค.

        ๋งค๊ฐœ๋ณ€์ˆ˜:
            token (str): ์ฐพ์„ ํ† ํฐ
        ๋ฐ˜ํ™˜๊ฐ’:
            index (int): ํ† ํฐ์— ํ•ด๋‹นํ•˜๋Š” ์ธ๋ฑ์Šค
        """
        return self._token_to_idx[token]

    def lookup_index(self, index):
        """ ์ธ๋ฑ์Šค์— ํ•ด๋‹นํ•˜๋Š” ํ† ํฐ์„ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.

        ๋งค๊ฐœ๋ณ€์ˆ˜:
            index (int): ์ฐพ์„ ์ธ๋ฑ์Šค
        ๋ฐ˜ํ™˜๊ฐ’:
            token (str): ์ธ๋ฑ์Šค์— ํ•ด๋‹นํ•˜๋Š” ํ† ํฐ
        ์—๋Ÿฌ:
            KeyError: ์ธ๋ฑ์Šค๊ฐ€ Vocabulary์— ์—†์„ ๋•Œ ๋ฐœ์ƒํ•ฉ๋‹ˆ๋‹ค.
        """
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)
class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):

        super(SequenceVocabulary, self).__init__(token_to_idx)

        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    def to_serializable(self):
        contents = super(SequenceVocabulary, self).to_serializable()
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         'begin_seq_token': self._begin_seq_token,
                         'end_seq_token': self._end_seq_token})
        return contents

    def lookup_token(self, token):
        """ ํ† ํฐ์— ๋Œ€์‘ํ•˜๋Š” ์ธ๋ฑ์Šค๋ฅผ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค.
        ํ† ํฐ์ด ์—†์œผ๋ฉด UNK ์ธ๋ฑ์Šค๋ฅผ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.

        ๋งค๊ฐœ๋ณ€์ˆ˜:
            token (str): ์ฐพ์„ ํ† ํฐ
        ๋ฐ˜ํ™˜๊ฐ’:
            index (int): ํ† ํฐ์— ํ•ด๋‹นํ•˜๋Š” ์ธ๋ฑ์Šค
        ๋…ธํŠธ:
            UNK ํ† ํฐ์„ ์‚ฌ์šฉํ•˜๋ ค๋ฉด (Vocabulary์— ์ถ”๊ฐ€ํ•˜๊ธฐ ์œ„ํ•ด)
            `unk_index`๊ฐ€ 0๋ณด๋‹ค ์ปค์•ผ ํ•ฉ๋‹ˆ๋‹ค.
        """
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]
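
SequenceVocabulary๋ฅผ ์ƒˆ๋กœ ๋งŒ๋“ค๋ฉด ๋„ค ๊ฐœ์˜ ํŠน์ˆ˜ ํ† ํฐ์ด ๋จผ์ € ๋“ฑ๋ก๋˜์–ด mask_index=0, unk_index=1, begin_seq_index=2, end_seq_index=3์ด ๋ฉ๋‹ˆ๋‹ค. ๋’ค์— ๋‚˜์˜ฌ ๋ชจ๋ธ์˜ padding_idx ๊ธฐ๋ณธ๊ฐ’ 0์ด mask ํ† ํฐ๊ณผ ๋งž๋ฌผ๋ฆฌ๋Š” ์ด์œ ์ด๊ธฐ๋„ ํ•ฉ๋‹ˆ๋‹ค. ์•„๋ž˜๋Š” ์ด๋ฅผ ํ™•์ธํ•ด๋ณด๋Š” ๊ฐ„๋‹จํ•œ ์˜ˆ์‹œ์ž…๋‹ˆ๋‹ค.

vocab = SequenceVocabulary()
print(vocab.mask_index, vocab.unk_index,
      vocab.begin_seq_index, vocab.end_seq_index)  # 0 1 2 3

vocab.add_token('a')            # ์ƒˆ ๋ฌธ์ž๋Š” 4๋ฒˆ๋ถ€ํ„ฐ ๋งคํ•‘๋ฉ๋‹ˆ๋‹ค
print(vocab.lookup_token('a'))  # 4
print(vocab.lookup_token('z'))  # ๋ฏธ๋“ฑ๋ก ํ† ํฐ์€ unk_index์ธ 1์„ ๋ฐ˜ํ™˜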

DataLoader ์†Œ์Šค์ฝ”๋“œ

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """ ํŒŒ์ดํ† ์น˜ DataLoader๋ฅผ ๊ฐ์‹ธ๊ณ  ์žˆ๋Š” ์ œ๋„ˆ๋ ˆ์ดํ„ฐ ํ•จ์ˆ˜.
    ๊ฐ ํ…์„œ๋ฅผ ์ง€์ •๋œ ์žฅ์น˜๋กœ ์ด๋™ํ•ฉ๋‹ˆ๋‹ค.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict
ย 

SurnameVectorizer

์‹œํ€€์Šค ์˜ˆ์ธก task์—์„œ๋Š” ๊ฐ ํƒ€์ž„ ์Šคํ…๋งˆ๋‹ค ํ† ํฐ ์ƒ˜ํ”Œ๊ณผ ํ† ํฐ ํƒ€๊นƒ์„ ๋‹ด์€ ์ •์ˆ˜ ์‹œํ€€์Šค 2๊ฐœ๋ฅผ ์ž…๋ ฅ์œผ๋กœ ๋ฐ›์Šต๋‹ˆ๋‹ค. ์ฃผ๋กœ ํ•˜๋‚˜์˜ ํ† ํฐ ์‹œํ€€์Šค๋ฅผ ํ•œ ํ† ํฐ์”ฉ ์—‡๊ฐˆ๋ฆฌ๊ฒŒ ํ•˜์—ฌ ์ƒ˜ํ”Œ๊ณผ ํƒ€๊นƒ์„ ๊ตฌ์„ฑํ•ฉ๋‹ˆ๋‹ค. ์ด ๊ณผ์ •์€ ๋‹ค์Œ๊ณผ ๊ฐ™์Šต๋‹ˆ๋‹ค.
  1. SequenceVocabulary์—์„œ ํ† ํฐ์„ ์ ์ ˆํ•œ ์ธ๋ฑ์Šค์— ๋งคํ•‘ํ•ฉ๋‹ˆ๋‹ค.
  2. BEGIN-OF-SEQUENCE, END-OF-SEQUENCE ํ† ํฐ์— ํ•ด๋‹นํ•˜๋Š” ์ธ๋ฑ์Šค๋ฅผ ์‹œํ€€์Šค ์•ž๋’ค์— ์ถ”๊ฐ€ํ•ฉ๋‹ˆ๋‹ค. (์ด์ œ ๋ชจ๋“  ์‹œํ€€์Šค๊ฐ€ ๋™์ผํ•œ BEGIN ์ธ๋ฑ์Šค๋กœ ์‹œ์ž‘ํ•ด ๋™์ผํ•œ END ์ธ๋ฑ์Šค๋กœ ๋๋‚ฉ๋‹ˆ๋‹ค.)
  3. ๋งˆ์ง€๋ง‰ ํ† ํฐ์„ ์ œ์™ธํ•œ ๋ชจ๋“  ํ† ํฐ์„ ํฌํ•จํ•˜๋„๋ก ์ž˜๋ผ ์ž…๋ ฅ ์‹œํ€€์Šค๋ฅผ ๋งŒ๋“ญ๋‹ˆ๋‹ค. ์ž…๋ ฅ ์‹œํ€€์Šค์—์„œ ๋‚จ๋Š” ๋ถ€๋ถ„์€ mask ํ† ํฐ์˜ ์ธ๋ฑ์Šค๋กœ ์ฑ„์›๋‹ˆ๋‹ค.
  4. ์ฒซ ๋ฒˆ์งธ ํ† ํฐ์„ ์ œ์™ธํ•œ ๋ชจ๋“  ํ† ํฐ์„ ํฌํ•จํ•˜๋„๋ก ์ž˜๋ผ ์ถœ๋ ฅ ์‹œํ€€์Šค๋ฅผ ๋งŒ๋“ญ๋‹ˆ๋‹ค. ์ถœ๋ ฅ ์‹œํ€€์Šค์—์„œ ๋‚จ๋Š” ๋ถ€๋ถ„์€ mask ํ† ํฐ์˜ ์ธ๋ฑ์Šค๋กœ ์ฑ„์›๋‹ˆ๋‹ค.
ย 
์œ„ ๊ณผ์ •์„ SurnameVectorizer์˜ vectorizer() ๋ฉ”์„œ๋“œ์—์„œ ์ˆ˜ํ–‰ํ•˜๊ฒŒ ๋ฉ๋‹ˆ๋‹ค.
def vectorize(self, surname, vector_length=-1):
    """ ์„ฑ์”จ๋ฅผ ์ƒ˜ํ”Œ๊ณผ ํƒ€๊นƒ ๋ฒกํ„ฐ๋กœ ๋ณ€ํ™˜ํ•ฉ๋‹ˆ๋‹ค

    ์„ฑ์”จ ๋ฒกํ„ฐ๋ฅผ ๋‘ ๊ฐœ์˜ ๋ฒกํ„ฐ surname[:-1]์™€ surname[1:]๋กœ ๋‚˜๋ˆ„์–ด ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค.
    ๊ฐ ํƒ€์ž„ ์Šคํ…์—์„œ ์ฒซ ๋ฒˆ์งธ ๋ฒกํ„ฐ๊ฐ€ ์ƒ˜ํ”Œ์ด๊ณ  ๋‘ ๋ฒˆ์งธ ๋ฒกํ„ฐ๊ฐ€ ํƒ€๊นƒ์ž…๋‹ˆ๋‹ค.

    ๋งค๊ฐœ๋ณ€์ˆ˜:
        surname (str): ๋ฒกํ„ฐ๋กœ ๋ณ€๊ฒฝํ•  ์„ฑ์”จ
        vector_length (int): ์ธ๋ฑ์Šค ๋ฒกํ„ฐ์˜ ๊ธธ์ด๋ฅผ ๋งž์ถ”๊ธฐ ์œ„ํ•œ ๋งค๊ฐœ๋ณ€์ˆ˜
    ๋ฐ˜ํ™˜๊ฐ’:
        ํŠœํ”Œ: (from_vector, to_vector)
        from_vector (numpy.ndarray): ์ƒ˜ํ”Œ ๋ฒกํ„ฐ
        to_vector (numpy.ndarray): ํƒ€๊นƒ ๋ฒกํ„ฐ
    """
    # ์‹œํ€€์Šค์˜ ์•ž๋’ค์— BEGIN index์™€ END index๋ฅผ ๋ถ™์—ฌ์ค๋‹ˆ๋‹ค
    # Vocabulary์—์„œ ํ† ํฐ์— ๋งคํ•‘๋œ index๋ฅผ ์ฐพ์•„ ๋„ฃ์–ด์ค๋‹ˆ๋‹ค
    indices = [self.char_vocab.begin_seq_index]
    indices.extend(self.char_vocab.lookup_token(token) for token in surname)
    indices.append(self.char_vocab.end_seq_index)

    if vector_length < 0:
        vector_length = len(indices) - 1

    # ๋งˆ์ง€๋ง‰์„ ์ œ์™ธํ•œ ๋ชจ๋“  ํ† ํฐ์„ ํฌํ•จํ•˜๋„๋ก ์ž˜๋ผ ์ž…๋ ฅ ์‹œํ€€์Šค๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค
    # ํฌํ•จ๋˜์ง€ ์•Š์€ ๋ถ€๋ถ„์€ mask ํ† ํฐ์˜ index๋กœ ์ฑ„์›Œ์ง‘๋‹ˆ๋‹ค
    from_vector = np.empty(vector_length, dtype=np.int64)
    from_indices = indices[:-1]
    from_vector[:len(from_indices)] = from_indices
    from_vector[len(from_indices):] = self.char_vocab.mask_index

    # ์ฒซ ๋ฒˆ์งธ๋ฅผ ์ œ์™ธํ•œ ๋ชจ๋“  ํ† ํฐ์„ ํฌํ•จํ•˜๋„๋ก ์ž˜๋ผ ์ถœ๋ ฅ ์‹œํ€€์Šค๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค
    # ํฌํ•จ๋˜์ง€ ์•Š์€ ๋ถ€๋ถ„์€ mask ํ† ํฐ์˜ index๋กœ ์ฑ„์›Œ์ง‘๋‹ˆ๋‹ค
    to_vector = np.empty(vector_length, dtype=np.int64)
    to_indices = indices[1:]
    to_vector[:len(to_indices)] = to_indices
    to_vector[len(to_indices):] = self.char_vocab.mask_index

    # ์ƒ์„ฑํ•œ ์ž…๋ ฅ ์‹œํ€€์Šค์™€ ์ถœ๋ ฅ ์‹œํ€€์Šค ์Œ์„ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค
    return from_vector, to_vector
ย 

SurnameVectorizer ์†Œ์Šค์ฝ”๋“œ

class SurnameVectorizer(object):
    """ ์–ดํœ˜ ์‚ฌ์ „์„ ์ƒ์„ฑํ•˜๊ณ  ๊ด€๋ฆฌํ•ฉ๋‹ˆ๋‹ค """

    def __init__(self, char_vocab, nationality_vocab):
        """
        ๋งค๊ฐœ๋ณ€์ˆ˜:
            char_vocab (Vocabulary): ๋ฌธ์ž๋ฅผ ์ •์ˆ˜๋กœ ๋งคํ•‘ํ•ฉ๋‹ˆ๋‹ค
            nationality_vocab (Vocabulary): ๊ตญ์ ์„ ์ •์ˆ˜๋กœ ๋งคํ•‘ํ•ฉ๋‹ˆ๋‹ค
        """
        self.char_vocab = char_vocab
        self.nationality_vocab = nationality_vocab

    def vectorize(self, surname, vector_length=-1):
        """ ์„ฑ์”จ๋ฅผ ์ƒ˜ํ”Œ๊ณผ ํƒ€๊นƒ ๋ฒกํ„ฐ๋กœ ๋ณ€ํ™˜ํ•ฉ๋‹ˆ๋‹ค

        ์„ฑ์”จ ๋ฒกํ„ฐ๋ฅผ ๋‘ ๊ฐœ์˜ ๋ฒกํ„ฐ surname[:-1]์™€ surname[1:]๋กœ ๋‚˜๋ˆ„์–ด ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค.
        ๊ฐ ํƒ€์ž„ ์Šคํ…์—์„œ ์ฒซ ๋ฒˆ์งธ ๋ฒกํ„ฐ๊ฐ€ ์ƒ˜ํ”Œ์ด๊ณ  ๋‘ ๋ฒˆ์งธ ๋ฒกํ„ฐ๊ฐ€ ํƒ€๊นƒ์ž…๋‹ˆ๋‹ค.

        ๋งค๊ฐœ๋ณ€์ˆ˜:
            surname (str): ๋ฒกํ„ฐ๋กœ ๋ณ€๊ฒฝํ•  ์„ฑ์”จ
            vector_length (int): ์ธ๋ฑ์Šค ๋ฒกํ„ฐ์˜ ๊ธธ์ด๋ฅผ ๋งž์ถ”๊ธฐ ์œ„ํ•œ ๋งค๊ฐœ๋ณ€์ˆ˜
        ๋ฐ˜ํ™˜๊ฐ’:
            ํŠœํ”Œ: (from_vector, to_vector)
            from_vector (numpy.ndarray): ์ƒ˜ํ”Œ ๋ฒกํ„ฐ
            to_vector (numpy.ndarray): ํƒ€๊นƒ ๋ฒกํ„ฐ
        """
        indices = [self.char_vocab.begin_seq_index]
        indices.extend(self.char_vocab.lookup_token(token) for token in surname)
        indices.append(self.char_vocab.end_seq_index)

        if vector_length < 0:
            vector_length = len(indices) - 1

        from_vector = np.empty(vector_length, dtype=np.int64)
        from_indices = indices[:-1]
        from_vector[:len(from_indices)] = from_indices
        from_vector[len(from_indices):] = self.char_vocab.mask_index

        to_vector = np.empty(vector_length, dtype=np.int64)
        to_indices = indices[1:]
        to_vector[:len(to_indices)] = to_indices
        to_vector[len(to_indices):] = self.char_vocab.mask_index

        return from_vector, to_vector

    @classmethod
    def from_dataframe(cls, surname_df):
        """๋ฐ์ดํ„ฐ์…‹ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์œผ๋กœ ๊ฐ์ฒด๋ฅผ ์ดˆ๊ธฐํ™”ํ•ฉ๋‹ˆ๋‹ค

        ๋งค๊ฐœ๋ณ€์ˆ˜:
            surname_df (pandas.DataFrame): ์„ฑ์”จ ๋ฐ์ดํ„ฐ์…‹
        ๋ฐ˜ํ™˜๊ฐ’:
            SurnameVectorizer ๊ฐ์ฒด
        """
        char_vocab = SequenceVocabulary()
        nationality_vocab = Vocabulary()

        for index, row in surname_df.iterrows():
            for char in row.surname:
                char_vocab.add_token(char)
            nationality_vocab.add_token(row.nationality)

        return cls(char_vocab, nationality_vocab)

    @classmethod
    def from_serializable(cls, contents):
        """ํŒŒ์ผ์—์„œ SurnameVectorizer ๊ฐ์ฒด๋ฅผ ์ดˆ๊ธฐํ™”ํ•ฉ๋‹ˆ๋‹ค

        ๋งค๊ฐœ๋ณ€์ˆ˜:
            contents (dict): SurnameVectorizer๋ฅผ ์œ„ํ•ด ๋‘ ๊ฐœ์˜ ์–ดํœ˜ ์‚ฌ์ „์„ ๋‹ด์€ ๋”•์…”๋„ˆ๋ฆฌ
                ์ด ๋”•์…”๋„ˆ๋ฆฌ๋Š” `vectorizer.to_serializable()`๋ฅผ ์‚ฌ์šฉํ•ด ๋งŒ๋“ญ๋‹ˆ๋‹ค
        ๋ฐ˜ํ™˜๊ฐ’:
            SurnameVectorizer์˜ ๊ฐ์ฒด
        """
        char_vocab = SequenceVocabulary.from_serializable(contents['char_vocab'])
        nat_vocab = Vocabulary.from_serializable(contents['nationality_vocab'])

        return cls(char_vocab=char_vocab, nationality_vocab=nat_vocab)

    def to_serializable(self):
        """ ์ง๋ ฌํ™”๋œ ๊ฒฐ๊ณผ๋ฅผ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค """
        return {'char_vocab': self.char_vocab.to_serializable(),
                'nationality_vocab': self.nationality_vocab.to_serializable()}
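
vectorize()์˜ ๋™์ž‘์„ ๋ˆˆ์œผ๋กœ ํ™•์ธํ•ด๋ณด๋Š” ๊ฐ„๋‹จํ•œ ์˜ˆ์‹œ์ž…๋‹ˆ๋‹ค. ์„ฑ์”จ ํ•œ ๊ฑด์œผ๋กœ ๋งŒ๋“  ๊ฐ€์ƒ์˜ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์„ ๊ฐ€์ •ํ–ˆ์œผ๋ฉฐ, ์‹ค์ œ ์ธ๋ฑ์Šค ๊ฐ’์€ ์–ดํœ˜ ์‚ฌ์ „ ๊ตฌ์ถ• ์ˆœ์„œ์— ๋”ฐ๋ผ ๋‹ฌ๋ผ์ง‘๋‹ˆ๋‹ค.

# ๊ฐ€์ •: ์„ฑ์”จ 'Lee' ํ•œ ๊ฑด์œผ๋กœ ๋งŒ๋“  ๊ฐ€์ƒ์˜ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„
demo_df = pd.DataFrame({'surname': ['Lee'],
                        'nationality': ['Korean'],
                        'split': ['train']})
demo_vectorizer = SurnameVectorizer.from_dataframe(demo_df)

# ํŠน์ˆ˜ ํ† ํฐ 4๊ฐœ ๋’ค์— 'L'=4, 'e'=5๋กœ ๋งคํ•‘๋˜๋ฏ€๋กœ
# indices = [2, 4, 5, 5, 3]  (<BEGIN> L e e <END>)
from_vector, to_vector = demo_vectorizer.vectorize('Lee', vector_length=7)
print(from_vector)  # [2 4 5 5 0 0 0] : indices[:-1] + MASK ํŒจ๋”ฉ
print(to_vector)    # [4 5 5 3 0 0 0] : indices[1:]  + MASK ํŒจ๋”ฉ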
ย 

3. ๋ชจ๋ธ 1) ์กฐ๊ฑด ์—†๋Š” SurnameGenerationModel

๋’ค์—์„œ ๋‹ค๋ฃฐ ๋ชจ๋ธ 2๋Š” ์€๋‹‰ ์ƒํƒœ(์€๋‹‰ ๋ฒกํ„ฐ)์— ๋ฏธ๋ฆฌ ๊ฐ’์„ ์ €์žฅํ•ด ํŽธํ–ฅ๋œ ๊ณ„์‚ฐ์„ ํ•˜๋„๋ก ์œ ๋„ํ•˜์ง€๋งŒ, ๋ณธ ๋ชจ๋ธ์—์„œ๋Š” ์ดˆ๊ธฐ๊ฐ’์„ 0์œผ๋กœ ์„ค์ •ํ•ด ์€๋‹‰ ์ƒํƒœ์˜ ์˜ํ–ฅ๋ ฅ์„ ์—†์• ๊ณ  ์‹œ์ž‘ํ•ฉ๋‹ˆ๋‹ค.
์—ฌ๊ธฐ์—์„œ
7-2 ๊ฒŒ์ดํŒ… : LSTM, GRU
์—์„œ ๋‹ค๋ค˜๋˜ GRU๋ฅผ self.rnn์— ๋„ฃ์–ด์ฃผ๋ฉฐ ๋ชจ๋ธ ๊ตฌ์„ฑ์— ํ™œ์šฉํ•˜๋Š” ๊ฒƒ์„ ์•„๋ž˜ ์ฝ”๋“œ์—์„œ ํ™•์ธํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ๊ตฌ์„ฑ ์š”์†Œ๋ฅผ ๋ฐ”๊พธ๋Š” ์ž‘์—…์€ ์ด์ฒ˜๋Ÿผ ํฌ๊ฒŒ ์–ด๋ ต์ง€ ์•Š์œผ๋ฉฐ, LSTM ์—ญ์‹œ ๋น„์Šทํ•œ ๋ฐฉ์‹์œผ๋กœ ๋ชจ๋ธ์˜ ๊ตฌ์„ฑ์š”์†Œ๋กœ ๋„ฃ์–ด์ค„ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
ย 
__init__ ๋ฉ”์„œ๋“œ์—์„œ๋Š” ๋ณธ ๋ชจ๋ธ์˜ ์ž„๋ฒ ๋”ฉ ์ธต, GRU, Linear ์ธต์„ ์ดˆ๊ธฐํ™”ํ•ฉ๋‹ˆ๋‹ค. ์ž„๋ฒ ๋”ฉ ์ธต์€ ์ •์ˆ˜ ์‹œํ€€์Šค๋ฅผ 3์ฐจ์› ํ…์„œ๋กœ ๋ณ€ํ™˜ํ•˜๊ณ , ์ด ํ…์„œ๊ฐ€ GRU๋ฅผ ํ†ต๊ณผํ•˜๋ฉฐ ์ƒํƒœ ๋ฒกํ„ฐ๊ฐ€ ๊ณ„์‚ฐ๋ฉ๋‹ˆ๋‹ค.
์•„๋ž˜ ์ฝ”๋“œ์˜ forward ๋ฉ”์„œ๋“œ์—์„œ ๋ณผ ์ˆ˜ ์žˆ๋“ฏ์ด, ๋ฌธ์ž ์‹œํ€€์Šค๋ฅผ ๋ฐ›์•„ ์ž„๋ฒ ๋”ฉํ•œ ๋’ค RNN(GRU)์œผ๋กœ ์ƒํƒœ๋ฅผ ์ˆœ์ฐจ์ ์œผ๋กœ ๊ณ„์‚ฐํ•ฉ๋‹ˆ๋‹ค. ์ดํ›„ Linear ์ธต(fc)์—์„œ ์˜ˆ์ธก ํ™•๋ฅ ์„ ๊ณ„์‚ฐํ•˜๊ฒŒ ๋ฉ๋‹ˆ๋‹ค.
ย 
class SurnameGenerationModel(nn.Module):
    def __init__(self, char_embedding_size, char_vocab_size, rnn_hidden_size,
                 batch_first=True, padding_idx=0, dropout_p=0.5):
        """
        ๋งค๊ฐœ๋ณ€์ˆ˜:
            char_embedding_size (int): ๋ฌธ์ž ์ž„๋ฒ ๋”ฉ ํฌ๊ธฐ
            char_vocab_size (int): ์ž„๋ฒ ๋”ฉ๋  ๋ฌธ์ž ๊ฐœ์ˆ˜
            rnn_hidden_size (int): RNN์˜ ์€๋‹‰ ์ƒํƒœ ํฌ๊ธฐ
            batch_first (bool): 0๋ฒˆ์งธ ์ฐจ์›์ด ๋ฐฐ์น˜์ธ์ง€ ์‹œํ€€์Šค์ธ์ง€ ๋‚˜ํƒ€๋‚ด๋Š” ํ”Œ๋ž˜๊ทธ
            padding_idx (int): ํ…์„œ ํŒจ๋”ฉ์„ ์œ„ํ•œ ์ธ๋ฑ์Šค;
                torch.nn.Embedding์„ ์ฐธ๊ณ ํ•˜์„ธ์š”
            dropout_p (float): ๋“œ๋กญ์•„์›ƒ์œผ๋กœ ํ™œ์„ฑํ™” ์ถœ๋ ฅ์„ 0์œผ๋กœ ๋งŒ๋“ค ํ™•๋ฅ 
        """
        super(SurnameGenerationModel, self).__init__()

        self.char_emb = nn.Embedding(num_embeddings=char_vocab_size,
                                     embedding_dim=char_embedding_size,
                                     padding_idx=padding_idx)

        self.rnn = nn.GRU(input_size=char_embedding_size,
                          hidden_size=rnn_hidden_size,
                          batch_first=batch_first)

        self.fc = nn.Linear(in_features=rnn_hidden_size,
                            out_features=char_vocab_size)

        self._dropout_p = dropout_p

    def forward(self, x_in, apply_softmax=False):
        """๋ชจ๋ธ์˜ ์ •๋ฐฉํ–ฅ ๊ณ„์‚ฐ

        ๋งค๊ฐœ๋ณ€์ˆ˜:
            x_in (torch.Tensor): ์ž…๋ ฅ ๋ฐ์ดํ„ฐ ํ…์„œ
                x_in.shape๋Š” (batch, input_dim)์ž…๋‹ˆ๋‹ค.
            apply_softmax (bool): ์†Œํ”„ํŠธ๋งฅ์Šค ํ™œ์„ฑํ™”๋ฅผ ์œ„ํ•œ ํ”Œ๋ž˜๊ทธ๋กœ
                ํ›ˆ๋ จ ์‹œ์—๋Š” False๊ฐ€ ๋˜์–ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
        ๋ฐ˜ํ™˜๊ฐ’:
            ๊ฒฐ๊ณผ ํ…์„œ. tensor.shape๋Š” (batch, seq_size, char_vocab_size)์ž…๋‹ˆ๋‹ค.
        """
        x_embedded = self.char_emb(x_in)

        y_out, _ = self.rnn(x_embedded)

        batch_size, seq_size, feat_size = y_out.shape
        y_out = y_out.contiguous().view(batch_size * seq_size, feat_size)

        y_out = self.fc(F.dropout(y_out, p=self._dropout_p))

        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)

        new_feat_size = y_out.shape[-1]
        y_out = y_out.view(batch_size, seq_size, new_feat_size)

        return y_out
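
์•„๋ž˜๋Š” ์กฐ๊ฑด ์—†๋Š” ๋ชจ๋ธ์˜ ์ž…์ถœ๋ ฅ ํฌ๊ธฐ๋ฅผ ํ™•์ธํ•ด๋ณด๋Š” ๊ฐ„๋‹จํ•œ ์Šค์ผ€์น˜์ž…๋‹ˆ๋‹ค. ์–ดํœ˜ ์‚ฌ์ „ ํฌ๊ธฐ 30๊ณผ ์‹œํ€€์Šค ๊ธธ์ด 17์€ ์ž„์˜๋กœ ๊ฐ€์ •ํ•œ ๊ฐ’์ž…๋‹ˆ๋‹ค.

import torch

model = SurnameGenerationModel(char_embedding_size=32,
                               char_vocab_size=30,  # ๊ฐ€์ •ํ•œ ์–ดํœ˜ ์‚ฌ์ „ ํฌ๊ธฐ
                               rnn_hidden_size=32)
x_in = torch.randint(0, 30, (128, 17))  # (batch, max_seq_length) ์ •์ˆ˜ ์‹œํ€€์Šค
y_out = model(x_in)
print(y_out.shape)  # torch.Size([128, 17, 30]) = (batch, seq, char_vocab_size)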
ย 

4. ๋ชจ๋ธ 2) ์กฐ๊ฑด ์žˆ๋Š” SurnameGenerationModel

์ด๋ฒˆ์—๋Š” ์„ฑ์”จ๋ฅผ ์ƒ์„ฑํ•˜๋Š” ๊ณผ์ •์—์„œ ๊ตญ์ ์„ ๊ณ ๋ คํ•˜๋„๋ก ๋ชจ๋ธ์„ ํ˜•์„ฑํ•ฉ๋‹ˆ๋‹ค. ์ฆ‰, ์€๋‹‰ ์ƒํƒœ์— ๊ตญ์ ์„ ์ž„๋ฒ ๋”ฉํ•˜์—ฌ RNN์˜ ์ดˆ๊ธฐ ์€๋‹‰ ์ƒํƒœ๋ฅผ ๋งŒ๋“ค์–ด์ฃผ์–ด, ์„ฑ์”จ์™€ ๊ตญ์  ์‚ฌ์ด์˜ ๊ทœ์น™์— ์กฐ๊ธˆ ๋” ๋ฏผ๊ฐํ•˜๊ฒŒ ๋ฐ˜์‘ํ•˜๋„๋ก ๋งŒ๋“ค์–ด์ค๋‹ˆ๋‹ค.
ย 
์•„๋ž˜์˜ ์ฝ”๋“œ์™€ ์œ„ ์กฐ๊ฑด ์—†๋Š” ๋ชจ๋ธ์˜ ์ฝ”๋“œ์˜ ๋‹ค๋ฅธ ์ ์€ ๊ตญ์  ์ธ๋ฑ์Šค๋ฅผ ๋งคํ•‘ํ•˜๋Š” ์ž„๋ฒ ๋”ฉ ์ธต nation_emb์ด ์ถ”๊ฐ€๋˜์—ˆ๋‹ค๋Š” ์ ์ž…๋‹ˆ๋‹ค. ๊ตญ์  ์ธ๋ฑ์Šค๋ฅผ RNN์˜ ์€๋‹‰ ์ธต๊ณผ ๊ฐ™์€ ํฌ๊ธฐ์˜ ๋ฒกํ„ฐ๋กœ ๋งคํ•‘ํ•˜๊ณ , forward ๊ณผ์ •์—์„œ RNN์˜ ์ดˆ๊ธฐ ์€๋‹‰ ์ƒํƒœ๋กœ ์ „๋‹ฌ๋ฉ๋‹ˆ๋‹ค. ์•„๋ž˜ ์†Œ์Šค์ฝ”๋“œ์—์„œ๋Š” ๋“ค์—ฌ์“ฐ๊ธฐ ์—†๋Š” ์ฃผ์„์œผ๋กœ ์ถ”๊ฐ€๋œ ๋ถ€๋ถ„์„ ํ‘œ์‹œํ•ด๋‘์—ˆ์Šต๋‹ˆ๋‹ค.
ย 
class SurnameGenerationModel(nn.Module):
    def __init__(self, char_embedding_size, char_vocab_size, num_nationalities,
                 rnn_hidden_size, batch_first=True, padding_idx=0, dropout_p=0.5):
        """
        ๋งค๊ฐœ๋ณ€์ˆ˜:
            char_embedding_size (int): ๋ฌธ์ž ์ž„๋ฒ ๋”ฉ ํฌ๊ธฐ
            char_vocab_size (int): ์ž„๋ฒ ๋”ฉ๋  ๋ฌธ์ž ๊ฐœ์ˆ˜
            num_nationalities (int): ๊ตญ์  ์ž„๋ฒ ๋”ฉ์— ์‚ฌ์šฉํ•  ๊ตญ์  ๊ฐœ์ˆ˜
            rnn_hidden_size (int): RNN์˜ ์€๋‹‰ ์ƒํƒœ ํฌ๊ธฐ
            batch_first (bool): 0๋ฒˆ์งธ ์ฐจ์›์ด ๋ฐฐ์น˜์ธ์ง€ ์‹œํ€€์Šค์ธ์ง€ ๋‚˜ํƒ€๋‚ด๋Š” ํ”Œ๋ž˜๊ทธ
            padding_idx (int): ํ…์„œ ํŒจ๋”ฉ์„ ์œ„ํ•œ ์ธ๋ฑ์Šค;
                torch.nn.Embedding์„ ์ฐธ๊ณ ํ•˜์„ธ์š”
            dropout_p (float): ๋“œ๋กญ์•„์›ƒ์œผ๋กœ ํ™œ์„ฑํ™” ์ถœ๋ ฅ์„ 0์œผ๋กœ ๋งŒ๋“ค ํ™•๋ฅ 
        """
        super(SurnameGenerationModel, self).__init__()

        self.char_emb = nn.Embedding(num_embeddings=char_vocab_size,
                                     embedding_dim=char_embedding_size,
                                     padding_idx=padding_idx)

        # ๊ตญ์  ์ž„๋ฒ ๋”ฉ ์ธต ์ถ”๊ฐ€
        self.nation_emb = nn.Embedding(num_embeddings=num_nationalities,
                                       embedding_dim=rnn_hidden_size)

        self.rnn = nn.GRU(input_size=char_embedding_size,
                          hidden_size=rnn_hidden_size,
                          batch_first=batch_first)

        self.fc = nn.Linear(in_features=rnn_hidden_size,
                            out_features=char_vocab_size)

        self._dropout_p = dropout_p

    def forward(self, x_in, nationality_index, apply_softmax=False):
        """๋ชจ๋ธ์˜ ์ •๋ฐฉํ–ฅ ๊ณ„์‚ฐ

        ๋งค๊ฐœ๋ณ€์ˆ˜:
            x_in (torch.Tensor): ์ž…๋ ฅ ๋ฐ์ดํ„ฐ ํ…์„œ
                x_in.shape๋Š” (batch, max_seq_size)์ž…๋‹ˆ๋‹ค.
            nationality_index (torch.Tensor): ๊ฐ ๋ฐ์ดํ„ฐ ํฌ์ธํŠธ๋ฅผ ์œ„ํ•œ ๊ตญ์  ์ธ๋ฑ์Šค
                RNN์˜ ์€๋‹‰ ์ƒํƒœ๋ฅผ ์ดˆ๊ธฐํ™”ํ•˜๋Š” ๋ฐ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค.
            apply_softmax (bool): ์†Œํ”„ํŠธ๋งฅ์Šค ํ™œ์„ฑํ™”๋ฅผ ์œ„ํ•œ ํ”Œ๋ž˜๊ทธ๋กœ
                ํ›ˆ๋ จ ์‹œ์—๋Š” False๊ฐ€ ๋˜์–ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
        ๋ฐ˜ํ™˜๊ฐ’:
            ๊ฒฐ๊ณผ ํ…์„œ. tensor.shape๋Š” (batch, seq_size, char_vocab_size)์ž…๋‹ˆ๋‹ค.
        """
        x_embedded = self.char_emb(x_in)

        # ๊ตญ์  ์ž„๋ฒ ๋”ฉ์„ ์ดˆ๊ธฐ ์€๋‹‰ ์ƒํƒœ๋กœ ์‚ฌ์šฉ
        # hidden_size: (num_layers * num_directions, batch_size, rnn_hidden_size)
        nationality_embedded = self.nation_emb(nationality_index).unsqueeze(0)

        y_out, _ = self.rnn(x_embedded, nationality_embedded)

        batch_size, seq_size, feat_size = y_out.shape
        y_out = y_out.contiguous().view(batch_size * seq_size, feat_size)

        y_out = self.fc(F.dropout(y_out, p=self._dropout_p))

        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)

        new_feat_size = y_out.shape[-1]
        y_out = y_out.view(batch_size, seq_size, new_feat_size)

        return y_out
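
์กฐ๊ฑด ์žˆ๋Š” ๋ชจ๋ธ๋„ ๊ฐ™์€ ๋ฐฉ์‹์œผ๋กœ ์ž…์ถœ๋ ฅ ํฌ๊ธฐ๋ฅผ ํ™•์ธํ•ด๋ณผ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ๊ตญ์  ๊ฐœ์ˆ˜ 18 ๋“ฑ์€ ์ž„์˜๋กœ ๊ฐ€์ •ํ•œ ๊ฐ’์ด๋ฉฐ, nationality_index๊ฐ€ (1, batch, rnn_hidden_size) ๋ชจ์–‘์˜ ์ดˆ๊ธฐ ์€๋‹‰ ์ƒํƒœ๋กœ ๋ณ€ํ™˜๋˜๋Š” ๊ฒƒ์ด ํ•ต์‹ฌ์ž…๋‹ˆ๋‹ค.

import torch

model = SurnameGenerationModel(char_embedding_size=32,
                               char_vocab_size=30,    # ๊ฐ€์ •ํ•œ ์–ดํœ˜ ์‚ฌ์ „ ํฌ๊ธฐ
                               num_nationalities=18,  # ๊ฐ€์ •ํ•œ ๊ตญ์  ๊ฐœ์ˆ˜
                               rnn_hidden_size=32)
x_in = torch.randint(0, 30, (128, 17))
nationality_index = torch.randint(0, 18, (128,))  # ๋ฐฐ์น˜๋ณ„ ๊ตญ์  ์ธ๋ฑ์Šค
y_out = model(x_in, nationality_index)
print(y_out.shape)  # torch.Size([128, 17, 30])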
ย 

๋ชจ๋ธ ํ›ˆ๋ จ๊ณผ ๊ฒฐ๊ณผ

์‹œํ€€์Šค์˜ ํƒ€์ž„ ์Šคํ…๋งˆ๋‹ค ์˜ˆ์ธก์„ ๋งŒ๋“ค๊ธฐ ๋•Œ๋ฌธ์— ์†์‹ค์„ ๊ณ„์‚ฐํ•˜๊ธฐ ์œ„ํ•ด์„œ๋Š” ์ด์ „ ์˜ˆ์ œ์—์„œ ๋‘ ๊ฐ€์ง€ ๋ณ€๊ฒฝํ•ด์•ผํ•  ๊ฒƒ๋“ค์ด ์žˆ์Šต๋‹ˆ๋‹ค. ์ฒซ์งธ๋กœ ๊ณ„์‚ฐ์„ ์œ„ํ•ด 3์ฐจ์› ํ…์„œ๋ฅผ 2์ฐจ์› ํ…์„œ์ธ ํ–‰๋ ฌ๋กœ ๋ณ€ํ™˜์‹œ์ผœ์•ผ ํ•ฉ๋‹ˆ๋‹ค. ๋‘๋ฒˆ์งธ๋กœ ๊ฐ€๋ณ€ ๊ธธ์ด ์‹œํ€€์Šค๋ฅผ ์œ„ํ•ด ๋งˆ์Šคํ‚น ์ธ๋ฑ์Šค๋ฅผ ์ค€๋น„ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. ๋งˆ์Šคํ‚น๋œ ์œ„์น˜์—์„œ๋Š” ์†์‹ค์„ ๊ณ„์‚ฐํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค.
ย 
์•„๋ž˜ ์ฝ”๋“œ๋ฅผ ์ด์šฉํ•ด 3์ฐจ์› ํ…์„œ์™€ ๊ฐ€๋ณ€ ๊ธธ์ด ์‹œํ€€์Šค ์ด์Šˆ๋ฅผ ๋‹ค๋ฃน๋‹ˆ๋‹ค. ์˜ˆ์ธก๊ณผ ํƒ€๊นƒ์„ ์†์‹ค ํ•จ์ˆ˜๊ฐ€ ๊ธฐ๋Œ€ํ•˜๋Š” ํฌ๊ธฐ(์˜ˆ์ธก์€ 2์ฐจ์›, ํƒ€๊นƒ์€ 1์ฐจ์›)๋กœ ์ •๊ทœํ™”ํ•˜๋ฉด, ๊ฐ ํ–‰์€ ํ•˜๋‚˜์˜ ์ƒ˜ํ”Œ, ์ฆ‰ ์‹œํ€€์Šค์˜ ํ•˜๋‚˜์˜ ํƒ€์ž„ ์Šคํ…์„ ๋‚˜ํƒ€๋‚ด๊ฒŒ ๋ฉ๋‹ˆ๋‹ค. ๋‹ค์Œ์œผ๋กœ ignore_index๋ฅผ mask_index๋กœ ์ง€์ •ํ•˜์—ฌ ํฌ๋กœ์Šค ์—”ํŠธ๋กœํ”ผ ์†์‹ค์„ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค. ์ด๋ ‡๊ฒŒ ํ•˜๋ฉด ์†์‹ค ํ•จ์ˆ˜๊ฐ€ ํƒ€๊นƒ์—์„œ ๋งˆ์Šคํ‚น๋œ ์ธ๋ฑ์Šค ์œ„์น˜๋ฅผ ๋ฌด์‹œํ•ฉ๋‹ˆ๋‹ค.
ย 
def normalize_sizes(y_pred, y_true):
    """ํ…์„œ ํฌ๊ธฐ ์ •๊ทœํ™”

    ๋งค๊ฐœ๋ณ€์ˆ˜:
        y_pred (torch.Tensor): ๋ชจ๋ธ์˜ ์ถœ๋ ฅ
            3์ฐจ์› ํ…์„œ์ด๋ฉด ํ–‰๋ ฌ๋กœ ๋ณ€ํ™˜ํ•ฉ๋‹ˆ๋‹ค.
        y_true (torch.Tensor): ํƒ€๊นƒ ์˜ˆ์ธก
            ํ–‰๋ ฌ์ด๋ฉด ๋ฒกํ„ฐ๋กœ ๋ณ€ํ™˜ํ•ฉ๋‹ˆ๋‹ค.
    """
    if len(y_pred.size()) == 3:
        y_pred = y_pred.contiguous().view(-1, y_pred.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    return y_pred, y_true
ย 
def sequence_loss(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)
    return F.cross_entropy(y_pred, y_true, ignore_index=mask_index)
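
์•„๋ž˜๋Š” sequence_loss()๊ฐ€ ๊ธฐ๋Œ€ํ•˜๋Š” ์ž…๋ ฅ ํฌ๊ธฐ๋ฅผ ๋ณด์—ฌ์ฃผ๋Š” ๊ฐ„๋‹จํ•œ ์˜ˆ์‹œ์ž…๋‹ˆ๋‹ค. mask_index=0์€ SequenceVocabulary์˜ ๊ธฐ๋ณธ๊ฐ’์„ ๋”ฐ๋ฅธ ๊ฐ€์ •์ž…๋‹ˆ๋‹ค.

import torch

y_pred = torch.randn(128, 17, 30)         # (batch, seq, char_vocab_size) ๋ชจ๋ธ ์ถœ๋ ฅ
y_true = torch.randint(0, 30, (128, 17))  # (batch, seq) ํƒ€๊นƒ ์ธ๋ฑ์Šค
loss = sequence_loss(y_pred, y_true, mask_index=0)
print(loss.item())  # ํƒ€๊นƒ์ด 0(mask)์ธ ์œ„์น˜๋Š” ์†์‹ค ๊ณ„์‚ฐ์—์„œ ์ œ์™ธ๋ฉ๋‹ˆ๋‹ค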
ย 
๋ชจ๋ธ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋Š” ๋Œ€๋ถ€๋ถ„ ๋ฌธ์ž ์–ดํœ˜ ์‚ฌ์ „ ํฌ๊ธฐ์— ๋”ฐ๋ผ ๊ฒฐ์ •๋ฉ๋‹ˆ๋‹ค. ์ด๋•Œ ํฌ๊ธฐ๋Š” ๋ชจ๋ธ ์ž…๋ ฅ์— ๋‚˜ํƒ€๋‚˜๋Š” ์ด์‚ฐ์ ์ธ ํ† ํฐ์˜ ๊ฐœ์ˆ˜์ด๊ณ  ํƒ€์ž„ ์Šคํ…๋งˆ๋‹ค ์ถœ๋ ฅ์— ๋‚˜ํƒ€๋‚˜๋Š” ํด๋ž˜์Šค ๊ฐœ์ˆ˜์ž…๋‹ˆ๋‹ค. ๊ทธ ์™ธ ๋ชจ๋ธ์— ์‚ฌ์šฉ๋˜๋Š” ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋Š” ๋ฌธ์ž ์ž„๋ฒ ๋”ฉ ํฌ๊ธฐ์™€ RNN ์€๋‹‰ ์ƒํƒœ ํฌ๊ธฐ์ž…๋‹ˆ๋‹ค. ๋‹ค์Œ ์ฝ”๋“œ์—์„œ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ์™€ ํ›ˆ๋ จ ์„ค์ •์„ ๋ณด๊ฒ ์Šต๋‹ˆ๋‹ค.
ย 
args = Namespace(
    # ๋ฐ์ดํ„ฐ์™€ ๊ฒฝ๋กœ ์ •๋ณด
    surname_csv="data/surnames/surnames_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="model_storage/ch7/model2_conditioned_surname_generation",
    # ๋ชจ๋ธ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ
    char_embedding_size=32,
    rnn_hidden_size=32,
    # ํ›ˆ๋ จ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ
    seed=1337,
    learning_rate=0.001,
    batch_size=128,
    num_epochs=100,
    early_stopping_criteria=5,
    # ์‹คํ–‰ ์˜ต์…˜
    catch_keyboard_interrupt=True,
    cuda=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
)
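
๋ณธ๋ฌธ์—๋Š” ์ „์ฒด ํ›ˆ๋ จ ๋ฃจํ”„๊ฐ€ ์ƒ๋žต๋˜์–ด ์žˆ์œผ๋ฏ€๋กœ, ํ•œ ์—ํฌํฌ์˜ ํ›ˆ๋ จ ๋‹จ๊ณ„๋ฅผ ๊ฐ€์ •ํ•ด ๊ฐ„๋‹จํžˆ ์Šค์ผ€์น˜ํ•˜๋ฉด ๋‹ค์Œ๊ณผ ๊ฐ™์Šต๋‹ˆ๋‹ค. ์กฐ๊ฑด ์žˆ๋Š” ๋ชจ๋ธ ๊ธฐ์ค€์ด๋ฉฐ, dataset, model, vectorizer๋Š” ์•ž์—์„œ ๋งŒ๋“  ๊ฐ์ฒด๋ผ๊ณ  ๊ฐ€์ •ํ•ฉ๋‹ˆ๋‹ค.

import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
mask_index = vectorizer.char_vocab.mask_index

dataset.set_split('train')
for batch_dict in generate_batches(dataset, batch_size=args.batch_size):
    optimizer.zero_grad()
    # ์กฐ๊ฑด ์žˆ๋Š” ๋ชจ๋ธ์ด๋ฏ€๋กœ ๊ตญ์  ์ธ๋ฑ์Šค๋ฅผ ํ•จ๊ป˜ ์ „๋‹ฌํ•ฉ๋‹ˆ๋‹ค
    y_pred = model(x_in=batch_dict['x_data'],
                   nationality_index=batch_dict['class_index'])
    loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index)
    loss.backward()
    optimizer.step()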
ย 
๋‹ค์Œ ์ฝ”๋“œ์—์„œ๋Š” forward() ๋ฉ”์„œ๋“œ์˜ ๋‹จ๊ณ„๋ฅผ ์ˆ˜์ •ํ•ด์„œ ์ƒˆ๋กœ์šด ๋ฐ˜๋ณต๋ฌธ์„ ๋งŒ๋“ญ๋‹ˆ๋‹ค. ์—ฌ๊ธฐ์—์„œ ํƒ€์ž„ ์Šคํ…๋งˆ๋‹ค ์˜ˆ์ธก์„ ๊ณ„์‚ฐํ•œ ๋’ค ๋‹ค์Œ ํƒ€์ž„ ์Šคํ…์˜ ์ž…๋ ฅ์œผ๋กœ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค. ์ด๋Š” ๋ชจ๋ธ์ด ์–ด๋–ค ์„ฑ์”จ๋ฅผ ์ƒ์„ฑํ–ˆ๋Š”์ง€ ์กฐ์‚ฌํ•˜์—ฌ ์งˆ์ ์œผ๋กœ ํ‰๊ฐ€ํ•˜๊ธฐ ์œ„ํ•จ์ž…๋‹ˆ๋‹ค. ๋ชจ๋ธ์€ ํƒ€์ž„ ์Šคํ…๋งˆ๋‹ค ์†Œํ”„ํŠธ๋งฅ์Šค ํ•จ์ˆ˜๋ฅผ ์‚ฌ์šฉํ•ด ํ™•๋ฅ  ๋ถ„ํฌ๋กœ ๋ณ€ํ™˜๋œ ์˜ˆ์ธก ๋ฒกํ„ฐ๋ฅผ ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค. ํ™•๋ฅ  ๋ถ„ํฌ๋ฅผ ์‚ฌ์šฉํ•˜๋ฉด torch.multinomial() ์ƒ˜ํ”Œ๋ง ํ•จ์ˆ˜๋ฅผ ์ด์šฉํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ์ด ํ•จ์ˆ˜๋Š” ์ธ๋ฑ์Šค ํ™•๋ฅ ์— ๋น„๋ก€ํ•˜์—ฌ ์ธ๋ฑ์Šค๋ฅผ ์„ ํƒํ•˜๋ฉฐ, ์ƒ˜ํ”Œ๋ง์€ ๋งค๋ฒˆ ๋‹ค๋ฅธ ์ถœ๋ ฅ์„ ๋งŒ๋“œ๋Š” ๋žœ๋คํ•œ ๊ณผ์ •์ž…๋‹ˆ๋‹ค.
ย 
def sample_from_model(model, vectorizer, num_samples=1, sample_size=20,
                      temperature=1.0):
    """๋ชจ๋ธ์ด ๋งŒ๋“  ์ธ๋ฑ์Šค ์‹œํ€€์Šค๋ฅผ ์ƒ˜ํ”Œ๋งํ•ฉ๋‹ˆ๋‹ค.

    ๋งค๊ฐœ๋ณ€์ˆ˜:
        model (SurnameGenerationModel): ํ›ˆ๋ จ ๋ชจ๋ธ
        vectorizer (SurnameVectorizer): SurnameVectorizer ๊ฐ์ฒด
        num_samples (int): ์ƒ์„ฑํ•  ์ƒ˜ํ”Œ ๊ฐœ์ˆ˜
        sample_size (int): ์ƒ˜ํ”Œ์˜ ์ตœ๋Œ€ ๊ธธ์ด
        temperature (float): ๋ฌด์ž‘์œ„์„ฑ ์ •๋„
            0.0 < temperature < 1.0 ์ด๋ฉด ์ตœ๋Œ€ ๊ฐ’์„ ์„ ํƒํ•  ๊ฐ€๋Šฅ์„ฑ์ด ๋†’์Šต๋‹ˆ๋‹ค
            temperature > 1.0 ์ด๋ฉด ๊ท ๋“ฑ ๋ถ„ํฌ์— ๊ฐ€๊น์Šต๋‹ˆ๋‹ค
    ๋ฐ˜ํ™˜๊ฐ’:
        indices (torch.Tensor): ์ธ๋ฑ์Šค ํ–‰๋ ฌ
            shape = (num_samples, sample_size)
    """
    begin_seq_index = [vectorizer.char_vocab.begin_seq_index
                       for _ in range(num_samples)]
    begin_seq_index = torch.tensor(begin_seq_index,
                                   dtype=torch.int64).unsqueeze(dim=1)
    indices = [begin_seq_index]
    # ์กฐ๊ฑด์ด ์—†๋Š” ๋ชจ๋ธ์ด๋ฏ€๋กœ ์ดˆ๊ธฐ ์€๋‹‰ ์ƒํƒœ๋Š” None(0 ๋ฒกํ„ฐ)์œผ๋กœ ๋‘ก๋‹ˆ๋‹ค
    h_t = None

    for time_step in range(sample_size):
        x_t = indices[time_step]
        x_emb_t = model.char_emb(x_t)
        rnn_out_t, h_t = model.rnn(x_emb_t, h_t)
        prediction_vector = model.fc(rnn_out_t.squeeze(dim=1))
        probability_vector = F.softmax(prediction_vector / temperature, dim=1)
        indices.append(torch.multinomial(probability_vector, num_samples=1))
    indices = torch.stack(indices).squeeze().permute(1, 0)
    return indices
ย 
๋‹ค์Œ ์ฝ”๋“œ์—์„œ๋Š” sample_from_model() ํ•จ์ˆ˜์—์„œ ์–ป์€ ์ƒ˜ํ”Œ๋ง ์ธ๋ฑ์Šค๋ฅผ ์‚ฌ๋žŒ์ด ์ฝ์„ ์ˆ˜ ์žˆ๋Š” ๋ฌธ์ž์—ด๋กœ ๋ฐ”๊พธ๊ธฐ ์œ„ํ•ด์„œ ์„ฑ์”จ๋ฅผ ๋ฒกํ„ฐํ™”ํ•˜๋Š” SequenceVocabulary๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค. ๋ฌธ์ž์—ด์„ ๋งŒ๋“ค ๋•Œ๋Š” END-OF-SEQUENCE ์ธ๋ฑ์Šค๊นŒ์ง€๋งŒ ์ธ๋ฑ์Šค๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค. ๋ชจ๋ธ์ด ์„ฑ์”จ๋ฅผ ์ข…๋ฃŒํ•  ๋•Œ๋ฅผ ํ•™์Šตํ–ˆ๋‹ค๊ณ  ๊ฐ€์ •ํ•˜๊ธฐ ๋–„๋ฌธ์ž…๋‹ˆ๋‹ค.
ย 
def decode_samples(sampled_indices, vectorizer):
    """์ธ๋ฑ์Šค๋ฅผ ์„ฑ์”จ ๋ฌธ์ž์—ด๋กœ ๋ณ€ํ™˜ํ•ฉ๋‹ˆ๋‹ค

    ๋งค๊ฐœ๋ณ€์ˆ˜:
        sampled_indices (torch.Tensor): `sample_from_model` ํ•จ์ˆ˜์—์„œ ์–ป์€ ์ธ๋ฑ์Šค
        vectorizer (SurnameVectorizer): SurnameVectorizer ๊ฐ์ฒด
    """
    decoded_surnames = []
    vocab = vectorizer.char_vocab

    for sample_index in range(sampled_indices.shape[0]):
        surname = ""
        for time_step in range(sampled_indices.shape[1]):
            sample_item = sampled_indices[sample_index, time_step].item()
            if sample_item == vocab.begin_seq_index:
                continue
            elif sample_item == vocab.end_seq_index:
                break
            else:
                surname += vocab.lookup_index(sample_item)
        decoded_surnames.append(surname)
    return decoded_surnames
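
๋‘ ํ•จ์ˆ˜๋ฅผ ์—ฎ์œผ๋ฉด ์กฐ๊ฑด ์—†๋Š” ๋ชจ๋ธ์—์„œ ๋‹ค์Œ๊ณผ ๊ฐ™์ด ์„ฑ์”จ๋ฅผ ๋ฝ‘์•„๋ณผ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. num_samples ๋“ฑ์˜ ๊ฐ’์€ ์ž„์˜๋กœ ๊ฐ€์ •ํ•œ ์˜ˆ์‹œ์ž…๋‹ˆ๋‹ค.

sampled_indices = sample_from_model(model, vectorizer, num_samples=5,
                                    sample_size=20, temperature=1.0)
for surname in decode_samples(sampled_indices, vectorizer):
    print(surname)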
ย 
๋‹ค์Œ์œผ๋กœ ์กฐ๊ฑด์ด ์žˆ๋Š” SurnameGenerationModel์„ ์œ„ํ•ด sample_from_model() ํ•จ์ˆ˜๋ฅผ ์ˆ˜์ •ํ•˜์—ฌ ์ƒ˜ํ”Œ ๊ฐœ์ˆ˜ ๋Œ€์‹ ์— ๊ตญ์  ์ธ๋ฑ์Šค์˜ ๋ฆฌ์ŠคํŠธ๋ฅผ ๋ฐ›์Šต๋‹ˆ๋‹ค. ์ด ํ•จ์ˆ˜๋Š” ๊ตญ์  ์ธ๋ฑ์Šค๋ฅผ ์ž„๋ฒ ๋”ฉ์œผ๋กœ ๋ฐ”๊พธ์–ด GRU์˜ ์ดˆ๊ธฐ ์€๋‹‰ ์ƒํƒœ๋กœ ์‚ฌ์šฉํ•˜๊ฒŒ ๋ฉ๋‹ˆ๋‹ค.
ย 
def sample_from_model(model, vectorizer, nationalities, sample_size=20,
                      temperature=1.0):
    """๋ชจ๋ธ์ด ๋งŒ๋“  ์ธ๋ฑ์Šค ์‹œํ€€์Šค๋ฅผ ์ƒ˜ํ”Œ๋งํ•ฉ๋‹ˆ๋‹ค.

    ๋งค๊ฐœ๋ณ€์ˆ˜:
        model (SurnameGenerationModel): ํ›ˆ๋ จ ๋ชจ๋ธ
        vectorizer (SurnameVectorizer): SurnameVectorizer ๊ฐ์ฒด
        nationalities (list): ๊ตญ์ ์„ ๋‚˜ํƒ€๋‚ด๋Š” ์ •์ˆ˜ ๋ฆฌ์ŠคํŠธ
        sample_size (int): ์ƒ˜ํ”Œ์˜ ์ตœ๋Œ€ ๊ธธ์ด
        temperature (float): ๋ฌด์ž‘์œ„์„ฑ ์ •๋„
            0.0 < temperature < 1.0 ์ด๋ฉด ์ตœ๋Œ€ ๊ฐ’์„ ์„ ํƒํ•  ๊ฐ€๋Šฅ์„ฑ์ด ๋†’์Šต๋‹ˆ๋‹ค
            temperature > 1.0 ์ด๋ฉด ๊ท ๋“ฑ ๋ถ„ํฌ์— ๊ฐ€๊น์Šต๋‹ˆ๋‹ค
    ๋ฐ˜ํ™˜๊ฐ’:
        indices (torch.Tensor): ์ธ๋ฑ์Šค ํ–‰๋ ฌ
            shape = (num_samples, sample_size)
    """
    num_samples = len(nationalities)
    begin_seq_index = [vectorizer.char_vocab.begin_seq_index
                       for _ in range(num_samples)]
    begin_seq_index = torch.tensor(begin_seq_index,
                                   dtype=torch.int64).unsqueeze(dim=1)
    indices = [begin_seq_index]
    nationality_indices = torch.tensor(nationalities,
                                       dtype=torch.int64).unsqueeze(dim=0)
    # ๊ตญ์  ์ž„๋ฒ ๋”ฉ์„ GRU์˜ ์ดˆ๊ธฐ ์€๋‹‰ ์ƒํƒœ๋กœ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค
    h_t = model.nation_emb(nationality_indices)

    for time_step in range(sample_size):
        x_t = indices[time_step]
        x_emb_t = model.char_emb(x_t)
        rnn_out_t, h_t = model.rnn(x_emb_t, h_t)
        prediction_vector = model.fc(rnn_out_t.squeeze(dim=1))
        probability_vector = F.softmax(prediction_vector / temperature, dim=1)
        indices.append(torch.multinomial(probability_vector, num_samples=1))
    indices = torch.stack(indices).squeeze().permute(1, 0)
    return indices
ย 
์ด์ œ ๊ตญ์  ์ธ๋ฑ์Šค๋ฅผ ์ˆœํšŒํ•˜๋ฉด์„œ ๊ฐ ๊ตญ์ ์—์„œ ์ƒ˜ํ”Œ๋ง์„ ์ˆ˜ํ–‰ํ•ฉ๋‹ˆ๋‹ค. ์ถœ๋ ฅ์„ ๋ณด๋ฉด ๋ชจ๋ธ์ด ์„ฑ์”จ ์ฒ ์ž์— ์žˆ๋Š” ์–ด๋–ค ํŒจํ„ด์„ ๋”ฐ๋ฆ„์„ ์•Œ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
ย 
model = model.cpu()
for index in range(len(vectorizer.nationality_vocab)):
    nationality = vectorizer.nationality_vocab.lookup_index(index)
    print("{} ์ƒ˜ํ”Œ: ".format(nationality))
    sampled_indices = sample_from_model(model, vectorizer,
                                        nationalities=[index] * 3,
                                        temperature=0.7)
    for sampled_surname in decode_samples(sampled_indices, vectorizer):
        print("- " + sampled_surname)
Arabic ์ƒ˜ํ”Œ:
- Bakin
- Heran
- Soib
Chinese ์ƒ˜ํ”Œ:
- Luag
- Rur
- Dao
Czech ์ƒ˜ํ”Œ:
- Ponnoir
- Stonaj
- Teutche
Dutch ์ƒ˜ํ”Œ:
- Fmitzim
- Fablelb
- Ulskomov
English ์ƒ˜ํ”Œ:
- Cintee
- Hillen
- Vannid
.....
ย 
ย 
์ด์ „ ๊ธ€ ์ฝ๊ธฐ