Combine multiple features using CountVectorizer in Python
Case: I need to run NLP on the title, description and product_url columns and combine them for prediction.
Using FeatureUnion
To do so, we can pipeline the steps of field extraction and conversion through:
from sklearn.pipeline import FeatureUnion, Pipeline
FeatureUnion helps you combine multiple features together, and Pipeline helps you chain the processing steps one after another.
# Train a random-forest classifier on three text columns. Each column is
# vectorized with its own TF-IDF model; a FeatureUnion combines the three
# outputs before they reach the classifier.

from operator import itemgetter

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer

# Columns fed to the pipeline; each one gets its own TF-IDF vectorizer.
TEXT_COLUMNS = ['title', 'description', 'product_url']

# NOTE: `data_subset` (features) and `y` (labels) are not defined in this
# snippet and must already exist. `data_subset` is presumably a pandas
# DataFrame containing TEXT_COLUMNS, since `.loc` is used on the split
# below -- confirm against the loading code.
X_train, X_test, y_train, y_test = train_test_split(
    data_subset, y, random_state=50, test_size=0.2)

# These three objects are not used by the pipeline below; they are kept only
# so that the module-level names remain available.
count_vect = CountVectorizer()
tf_transformer = TfidfTransformer(use_idf=False)
forest = RandomForestClassifier(n_estimators=100)


def _field_tfidf(column):
    """Build a sub-pipeline that TF-IDF-vectorizes a single column.

    Args:
        column: Key of the column to pull out of the input with ``X[column]``.

    Returns:
        A ``Pipeline`` with an ``'extract_field'`` step that selects
        ``column`` and a ``'tfidf'`` step holding a default
        ``TfidfVectorizer``.
    """
    return Pipeline([
        # itemgetter(column)(X) is X[column]; unlike a lambda it can be
        # pickled, so the fitted pipeline can be saved to disk.
        ('extract_field',
         FunctionTransformer(itemgetter(column), validate=False)),
        ('tfidf', TfidfVectorizer()),
    ])


# Step names are kept exactly as before so that parameter paths such as
# 'FeatureUnion__title_tfidf__tfidf__...' keep working.
transformer = [
    ('FeatureUnion', FeatureUnion([
        ('title_tfidf', _field_tfidf('title')),
        ('description_tfidf', _field_tfidf('description')),
        ('producturl_tfidf', _field_tfidf('product_url')),
    ])),
]
# NOTE(review): the classifier has no random_state, so the printed score may
# differ between runs even though the train/test split is fixed -- confirm
# whether that is intended.
transformer.append(('rfc', RandomForestClassifier(n_estimators=100)))

newPipeline = Pipeline(transformer)
newPipeline.fit(X_train.loc[:, TEXT_COLUMNS], y_train)
print(newPipeline.score(X_test.loc[:, TEXT_COLUMNS], y_test))