首页
首页

2019大数据挑战赛预赛

(line-number gutter from the blog's code renderer removed — the numbers 1-69 were layout residue, not content)

from nltk.tokenize import WordPunctTokenizer
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

from sklearn.linear_model import SGDClassifier as SGD

import warnings;warnings.filterwarnings('ignore')

# ------------------------------------------------------------------
# Sentiment-classification pipeline (2019 Big Data Challenge, prelim):
# TF-IDF features over word/punct tokens, a logistic-loss SGD
# classifier scored by out-of-fold AUC, and fold-averaged test
# probabilities written as a submission CSV under ./data/.
# ------------------------------------------------------------------

train = pd.read_csv("./data/train.csv", lineterminator='\n', header=0)
# Binary target: Negative -> 0, Positive -> 1.
train['label'] = train['label'].map({'Negative': 0, 'Positive': 1})

test = pd.read_csv("./data/20190527_test.csv", lineterminator='\n', header=0)

# Re-join the NLTK tokens with single spaces so TfidfVectorizer's default
# whitespace splitting sees exactly the units WordPunctTokenizer produced.
# The tokenizer is stateless, so one instance is reused for every row
# (the original constructed a fresh one per review).
tokenizer = WordPunctTokenizer()
train_data = [' '.join(tokenizer.tokenize(review)) for review in train['review'].values]
train_label = np.array(train['label'].values, dtype='int8')

test_data = [' '.join(tokenizer.tokenize(review)) for review in test['review'].values]

# Fit TF-IDF on train + test combined — a transductive setup that is
# acceptable in a competition where all test text is available up front.
ngram = 2
vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1, ngram), max_df=0.9)
corpus_all = train_data + test_data
vectorizer.fit(corpus_all)
corpus_all = vectorizer.transform(corpus_all)

# Split the combined sparse matrix back into train / test rows.
lentrain = len(train_data)
train_data = corpus_all[:lentrain]
test_data = corpus_all[lentrain:]

# NOTE(review): the original passed random_state=2019 together with
# shuffle=False; scikit-learn >= 0.24 rejects that combination with a
# ValueError, and older versions ignored the seed anyway. Dropping
# random_state keeps the same deterministic, unshuffled folds.
folds = StratifiedKFold(n_splits=30, shuffle=False)
predictions = np.zeros(test_data.shape[0])

aucs = []
for fold_, (train_index, test_index) in enumerate(folds.split(train_data, train_label)):
    print("Fold :{}".format(fold_ + 1))
    cv_train_data, cv_train_label = train_data[train_index], train_label[train_index]
    cv_test_data, cv_test_label = train_data[test_index], train_label[test_index]

    # loss='log' (logistic regression) is what makes predict_proba available.
    # NOTE(review): tol=10000 is an extremely loose stopping tolerance, so
    # optimisation halts almost immediately — confirm this is intentional
    # and not a typo (e.g. for max_iter). Also, scikit-learn >= 1.1 renames
    # this loss to 'log_loss'; kept as-is for compatibility with the
    # version this script was written against.
    model = SGD(alpha=0.00001, penalty='l2', tol=10000, shuffle=True, loss='log')
    model.fit(cv_train_data, cv_train_label)

    # Out-of-fold AUC on the held-out slice of the training data.
    auc = metrics.roc_auc_score(cv_test_label, model.predict_proba(cv_test_data)[:, 1])
    # Accumulate this fold's test probabilities into the n-fold average.
    predictions += model.predict_proba(test_data)[:, 1] / folds.n_splits

    aucs.append(auc)
    print("auc score: %.5f" % auc)

print('Mean auc', np.mean(aucs))
predictions = pd.DataFrame(predictions)
# 1-based row IDs for the submission ('ids' rather than 'id', which would
# shadow the builtin).
ids = pd.DataFrame(np.arange(1, len(predictions) + 1))
data = pd.concat([ids, predictions], axis=1)
data.to_csv('./data/merge_{}_predictions.csv'.format(np.mean(aucs)), header=['ID', 'Pred'], index=False)
支持一下
扫一扫,支持一下,爱你。
  • 微信扫一扫
  • 支付宝扫一扫