import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import string, re
from time import time
Import the data
df = pd.read_csv("https://query.data.world/s/yd24ckbjzyp7h6zp7bacafpv2lgfkh", encoding="ISO-8859-1")
display(df.shape)
display(df["relevance"].value_counts()/df.shape[0])
=================================================================================
(8000, 15)
no          0.821375
yes         0.177500
not sure    0.001125
Name: relevance, dtype: float64
stopwords = stop_words.ENGLISH_STOP_WORDS

def clean(doc):  # doc is a string of text
    doc = doc.replace("</br>", " ")  # This text contains a lot of </br> tags.
    doc = "".join([char for char in doc if char not in string.punctuation and not char.isdigit()])  # remove punctuation and numbers
    doc = " ".join([token for token in doc.split() if token not in stopwords])  # remove stopwords
    return doc
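Before vectorizing, the cleaner would normally be mapped over the whole text column. A minimal sketch of that step (assumed here, not shown explicitly in the original):

df["text"] = df["text"].astype(str).apply(clean)  # assumed step: clean every document in place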
x = df.text
y = df.relevance
print(x.shape, y.shape)
=========================================================================
(7991,) (7991,)
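The series have 7991 entries rather than 8000, which is consistent with the nine "not sure" rows (0.001125 × 8000 = 9) having been filtered out beforehand, e.g. with df = df[df.relevance != "not sure"]. The sklearn imports at the top point to a bag-of-words baseline; a minimal sketch, assuming the cleaned text from above and an illustrative 75/25 split:

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

vect = CountVectorizer()                  # bag-of-words features over the cleaned text
X_train = vect.fit_transform(x_train)
X_test = vect.transform(x_test)

clf = LogisticRegression()
clf.fit(X_train, y_train)
print("test accuracy:", accuracy_score(y_test, clf.predict(X_test)))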
import warnings
warnings.filterwarnings('ignore')

import re, os, sys
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, \
    Conv1D, MaxPooling1D, Embedding, LSTM
from keras.models import Model, Sequential
from keras.initializers import Constant

import tensorflow as tf
from tensorflow import keras
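These imports point to a tokenize-pad-embed pipeline feeding a small 1D-convolutional classifier. A minimal sketch of how the pieces fit together, assuming texts and raw_labels already exist and that the vocabulary cap, sequence length, and embedding size are placeholder values:

MAX_NUM_WORDS = 20000       # assumed vocabulary cap
MAX_SEQUENCE_LENGTH = 1000  # assumed padded length
EMBEDDING_DIM = 100         # assumed embedding size

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)                    # texts: list of raw strings (assumed)
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(raw_labels))  # raw_labels: integer class ids (assumed)

model = Sequential()
model.add(Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(Conv1D(128, 5, activation="relu"))
model.add(MaxPooling1D(5))
model.add(Conv1D(128, 5, activation="relu"))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation="relu"))
model.add(Dense(labels.shape[1], activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="rmsprop", metrics=["acc"])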
def load_directory_data(directory):
    data = {}
    data["sentence"] = []
    data["sentiment"] = []
    for file_path in os.listdir(directory):
        # tf.gfile.GFile is the TF 1.x API; in TF 2.x it lives at tf.io.gfile.GFile.
        with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
            data["sentence"].append(f.read())
            data["sentiment"].append(re.match(r"\d+_(\d+)\.txt", file_path).group(1))
    return pd.DataFrame.from_dict(data)
# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
    pos_df = load_directory_data(os.path.join(directory, "pos"))
    neg_df = load_directory_data(os.path.join(directory, "neg"))
    pos_df["polarity"] = 1
    neg_df["polarity"] = 0
    return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)
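A hedged sketch of how these helpers are typically driven, assuming the standard aclImdb archive layout (train/ and test/ each containing pos/ and neg/) and the Stanford download URL:

def download_and_load_datasets():
    # Download and extract the IMDB review archive (assumed source URL).
    dataset = tf.keras.utils.get_file(
        fname="aclImdb.tar.gz",
        origin="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        extract=True)
    base = os.path.join(os.path.dirname(dataset), "aclImdb")
    train_df = load_dataset(os.path.join(base, "train"))
    test_df = load_dataset(os.path.join(base, "test"))
    return train_df, test_df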
from keras.datasets import imdb  # missing import: imdb is not defined above

words = imdb.get_word_index()  # maps each word to its integer index
num2word = {}
for w in words.keys():
    num2word[words[w]] = w  # invert the mapping: index -> word
# x_val: validation split created earlier (assumed); decode its first 10 reviews.
x_val_words = np.stack([np.array(list(map(lambda x: num2word.get(x, "NONE"), x_val[i])))
                        for i in range(10)])
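To spot-check the decoding, a token array can be joined back into a readable string. Note that imdb.load_data() offsets word indices by 3 by default (0 = padding, 1 = start, 2 = unknown), so this naive inverse mapping is approximate:

print(" ".join(x_val_words[0]))  # first decoded review as plain text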