-- Download the dataset from the file server:
-- run command as DownloadExt.`` where
-- from="public/SogouCS.reduced.tar" and
-- to="/tmp/nlp/sogo";
-- Alternatively, use the command line:
-- !saveUploadFileToHome public/SogouCS.reduced.tar /tmp/nlp/sogo;
-- Load the XML data file; rowTag="doc" and charset="GBK" are passed to the
-- xml reader (presumably one row per <doc> element, decoded as GBK).
load xml.`/tmp/nlp/sogo/news_sohusite_xml.dat`
where rowTag="doc"
and charset="GBK"
as xmlData;

-- Derive the label from the first part of the URL's host name, e.g.
-- http://sports.sohu.com/20070422/n249599819.shtml -> `sports`.
-- Rows for which no label can be extracted are dropped.
select
    labelStr,
    content
from (
    select
        split(split(url, "/")[2], "\\.")[0] as labelStr,
        content
    from xmlData
) as temp
where labelStr is not null
as rawData;
-- Try the following SQL to explore how many labels we have and what they look like.
-- (Query xmlData here: rawData only keeps labelStr and content, so it has no url column.)
-- select distinct(split(split(url,"/")[2],"\\.")[0]) as labelStr from xmlData as output;
-- select split(split(url,"/")[2],"\\.")[0] as labelStr,url from xmlData as output;
-- Fit a StringIndex model on labelStr; it maps each label string to a number
-- and vice versa.
train rawData as StringIndex.`/tmp/nlp/label_mapping`
where inputCol="labelStr"
and outputCol="label";

-- Apply the fitted model to convert the string labels to numbers.
predict rawData as StringIndex.`/tmp/nlp/label_mapping`
as rawDataWithLabel;

-- `register` turns a saved model into a callable SQL function.
register StringIndex.`/tmp/nlp/label_mapping` as convert_label;
-- We can reduce the dataset: with too much data and only limited resources,
-- training may take too long. You can use the command line (below),
-- or the equivalent raw ET:
--
-- run rawDataWithLabel as RateSampler.``
-- where labelCol="label" and sampleRate="0.9,0.1"
-- as xmlDataArray;
-- Split rawDataWithLabel by label using the rates "0.9,0.1"; the result table
-- carries a __split__ column identifying the group of each row.
!split rawDataWithLabel by label with "0.9,0.1" named xmlDataArray;

-- Keep only the group at position one, i.e. the 10% sample.
select *
from xmlDataArray
where __split__ = 1
as miniXmlData;

-- Persist the sample and read it back, because producing it takes a long time.
save overwrite miniXmlData as parquet.`/tmp/nlp/miniXmlData`;
load parquet.`/tmp/nlp/miniXmlData` as miniXmlData;

-- Uncomment to preview the sample:
-- select * from miniXmlData limit 10 as output;
-- Convert the `content` column to TF-IDF format; the transformed table is
-- exposed as trainData.
train miniXmlData as TfIdfInPlace.`/tmp/nlp/tfidf`
where inputCol="content"
as trainData;

-- Persist the TF-IDF output and read it back for the training step.
save overwrite trainData as parquet.`/tmp/nlp/trainData`;
load parquet.`/tmp/nlp/trainData` as trainData;

-- Register the TF-IDF model as a function as well.
register TfIdfInPlace.`/tmp/nlp/tfidf` as tfidf_predict;
-- Train a RandomForest classifier on the TF-IDF data.
--   featuresCol: the column holding the TF-IDF features ("content").
--   labelCol:    the numeric label column created by the StringIndex step
--                (outputCol="label"). The parameter name is `labelCol`; the
--                value is the column name, not the other way round.
--   keepVersion="true" is passed through unchanged (presumably keeps earlier
--   model versions under the same path -- confirm against the engine docs).
train trainData as RandomForest.`/tmp/nlp/rf` where
keepVersion="true"
and fitParam.0.featuresCol="content"
and fitParam.0.labelCol="label"
and fitParam.0.maxDepth="4"
and fitParam.0.checkpointInterval="100"
;
-- Register the RandomForest model as a function.
register RandomForest.`/tmp/nlp/rf` as rf_predict;

-- End-to-end prediction: raw text -> tfidf_predict -> rf_predict.
-- This chain can also be deployed as an API service.
select
    rf_predict(tfidf_predict("新聞不錯")) as predicted
as output;
-- MLSQL NLP Example
-- 發表評論
-- 所有評論
-- 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.