Combine multiple features after CountVectorizer and TFIDF

Combine multiple features using CountVectorizer in Python

Case: I need to run NLP on the title, description and product_url columns and combine them for prediction.

Using FeatureUnion

To do so, we can pipeline the field-extraction and conversion steps using:
from sklearn.pipeline import FeatureUnion, Pipeline

FeatureUnion lets you combine the outputs of several feature extractors into one feature matrix, and Pipeline lets you chain the processing steps together.

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# train_test_split was called without ever being imported (NameError).
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible. `data_subset` and `y` must already be defined --
# presumably a pandas DataFrame holding the text columns and the matching
# target labels (confirm against the code that loads the data).
X_train, X_test, y_train, y_test = train_test_split(
    data_subset, y, test_size=0.2, random_state=50
)

# Stand-alone estimators. NOTE(review): the combined pipeline further down
# builds its own vectorizers and classifier and does not use these three
# objects; they are kept so the module-level names stay available.
count_vect = CountVectorizer()
tf_transformer = TfidfTransformer(use_idf=False)
forest = RandomForestClassifier(n_estimators=100)

from operator import itemgetter

# TfidfVectorizer was used below without ever being imported (NameError).
from sklearn.feature_extraction.text import TfidfVectorizer


def _tfidf_branch(column):
    """Build a pipeline that selects ``column`` from the input and TF-IDF encodes it.

    ``itemgetter(column)`` performs the same ``x[column]`` lookup as a lambda
    would, but, unlike a lambda, it can be pickled, so the fitted pipeline can
    be saved.
    """
    return Pipeline([
        ('extract_field',
         FunctionTransformer(itemgetter(column), validate=False)),
        ('tfidf', TfidfVectorizer()),
    ])


# Columns fed to the model; each one gets its own independent vocabulary.
text_columns = ['title', 'description', 'product_url']

# FeatureUnion runs every branch on the same input and stacks the resulting
# feature matrices side by side. Step names are unchanged from the original.
transformer = [('FeatureUnion', FeatureUnion([
    ('title_tfidf', _tfidf_branch('title')),
    ('description_tfidf', _tfidf_branch('description')),
    ('producturl_tfidf', _tfidf_branch('product_url')),
]))]
transformer.append(('rfc', RandomForestClassifier(n_estimators=100)))

# Full model: per-column TF-IDF features -> random forest classifier.
newPipeline = Pipeline(transformer)
newPipeline.fit(X_train.loc[:, text_columns], y_train)

# Pipeline.score applies the transformers and then calls the classifier's
# score(), which for a classifier is mean accuracy on the held-out rows.
print(newPipeline.score(X_test.loc[:, text_columns], y_test))

No comments:

Post a Comment

T-SQL LEAD LAG and SUM function based query

  Query on T-SQL window clause Below is the sales table Order_Date Name Product SubCategory ...