简易情感分析

下面结合 sklearn、pandas 以及刚讲过的工具库，构建一个简易的情感分析模型。

# pandas loads and reshapes the tweet dataset; numpy is imported alongside it.
import numpy as np
import pandas as pd

加载数据

1	data = pd.read_csv("./data/emotion_data.csv")

1	data.shape

(40000, 4)

1	data.head()

	tweet_id	sentiment	author	content
0	1956967341	empty	xoshayzers	@tiffanylue i know i was listenin to bad habi…
1	1956967666	sadness	wannamama	Layin n bed with a headache ughhhh…waitin o…
2	1956967696	sadness	coolfunky	Funeral ceremony…gloomy friday…
3	1956967789	enthusiasm	czareaquino	wants to hang out with friends SOON!
4	1956968416	neutral	xkilljoyx	@dannycastillo We want to trade with someone w…

# List the distinct sentiment categories present in the dataset.
data.sentiment.unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

数据预处理

# Drop the columns that are irrelevant to classification: positions 0 and 2
# (tweet_id and author), keeping only sentiment and content.
data = data.drop(data.columns[[0, 2]], axis=1)

1	data.head()

	sentiment	content
0	empty	@tiffanylue i know i was listenin to bad habi…
1	sadness	Layin n bed with a headache ughhhh…waitin o…
2	sadness	Funeral ceremony…gloomy friday…
3	enthusiasm	wants to hang out with friends SOON!
4	neutral	@dannycastillo We want to trade with someone w…

# Convert the DataFrame to a NumPy array. `.values` replaces the deprecated
# `DataFrame.as_matrix()`, which emits a FutureWarning and is removed in
# later pandas releases.
dataset = data.values

/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:1: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.
  """Entry point for launching an IPython kernel.

1	dataset.shape

(40000, 2)

1	features = dataset[:,1]

1	features[123]

'@poinktoinkdoink He died.  Wait, what about Magic Jack? I just read it.'

1	target = dataset[:,0]

# Use LabelEncoder to encode the sentiment targets as integer class ids.
# The fitted encoder `le` is kept so that ids can be mapped back to the
# sentiment names later (see `le.classes_`).
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
target_processed = le.fit_transform(target)

1	le.classes_

array(['anger', 'boredom', 'empty', 'enthusiasm', 'fun', 'happiness',
       'hate', 'love', 'neutral', 'relief', 'sadness', 'surprise',
       'worry'], dtype=object)

# Extract features from the input text and vectorize it (the tf-idf features
# used here are covered later in the course).
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split further down, so its vocabulary and idf statistics also
# include the tweets that end up in the test set.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X_processed = tfidf.fit_transform(features)

1	X_processed

<40000x48212 sparse matrix of type '<class 'numpy.float64'>'
    with 475946 stored elements in Compressed Sparse Row format>

# Hold out half of the samples for evaluation; the fixed random_state makes
# the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, target_processed, test_size=0.5, random_state=42
)

y_train

array([ 3,  5, 10, ...,  4,  6,  7])

模型训练

1
2
3

# Train a logistic-regression classifier with default hyperparameters on the
# training split. `fit` returns the estimator, which the notebook displays.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Model evaluation: score the classifier on the held-out test split
# (for classifiers, `score` reports mean accuracy).
lr.score(X_test, y_test)

0.3489

# Model prediction on a new example sentence.
test_ex = ["It is so horrible"]
# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# example is mapped onto the same feature columns the model was trained on.
text_ex_processed = tfidf.transform(test_ex)
# predict returns encoded class ids; index into `le.classes_` (or call
# `le.inverse_transform`) to recover the sentiment name.
lr.predict(text_ex_processed)

array([12])

1
2