diff --git a/python/neural_bag_of_words.py b/python/neural_bag_of_words.py index e69de29..2744d24 100644 --- a/python/neural_bag_of_words.py +++ b/python/neural_bag_of_words.py @@ -0,0 +1,38 @@ +import collections + +import datasets +import matplotlib.pyplot as plt +import numpy as np +import torch +import torch.nn as nn +import torch.optim as optim +import nltk +nltk.download('punkt') +nltk.download('averaged_perceptron_tagger') +import tqdm + +seed = 1234 +validate_size = 0.25 + +np.random.seed(seed) +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.backends.cudnn.deterministic = True + +train_data, test_data = datasets.load_dataset("imdb", split=["train", "test"]) + +print(nltk.pos_tag(nltk.word_tokenize("Hello there you stupid fucking whore mr parker"))) + +def tokenize(input): + return {"tokens": nltk.word_tokenize(input["text"])} + +train_data = train_data.map(tokenize) +test_data = test_data.map(tokenize) + +train_valid_data = train_data.train_test_split(test_size=validate_size) +train_data = train_valid_data["train"] +valid_data = train_valid_data["test"] + +print(train_data) +print(test_data) +print(valid_data) \ No newline at end of file