
[AI]【NLP】iFLYTEK English Academic Paper Classification Challenge Top 10 Open-Source Solutions, Part 4: Machine Learning LGB Solution

1 Related Information

2 Introduction

(1) Machine learning methods were not a focus of our work on this task; we only wrote a simple baseline. This part was implemented by teammate Boyuan and was used in the final model-ensembling stage of the competition. A single LGB model, with no hyperparameter tuning and 5-fold cross-validation, reached 0.79+ on the leaderboard. There is still plenty of room for improvement; if time allows, the following techniques could be used to raise the score:

  • XGB model
  • LR model
  • Multi-model ensembling with EnsembleVoteClassifier (see the sketch after this list)
  • Multi-model ensembling with StackingClassifier
  • Pseudo-label training (a rough sketch appears after the code in section 3)
  • Data augmentation
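
To make the ensembling items concrete, here is a minimal sketch (not part of the original solution) of soft voting with mlxtend's EnsembleVoteClassifier over an LGB, an XGB, and an LR model. The feature matrix X_train_weight and labels label_train are assumed to be the TF-IDF features produced in section 3, and the voting weights are arbitrary.

# Hedged sketch only: combine three base classifiers by soft voting.
# X_train_weight / label_train are assumed to come from the TF-IDF pipeline in section 3.
from mlxtend.classifier import EnsembleVoteClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

lgb_clf = LGBMClassifier(num_leaves=35, max_depth=5, learning_rate=0.1)
xgb_clf = XGBClassifier(max_depth=5, learning_rate=0.1)
lr_clf = LogisticRegression(max_iter=1000)

ensemble = EnsembleVoteClassifier(clfs=[lgb_clf, xgb_clf, lr_clf],
                                  voting='soft', weights=[2, 1, 1])
ensemble.fit(X_train_weight, label_train)
val_pred = ensemble.predict(X_validate_weight)
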
(2) Implementation steps:
  • Data preprocessing
  • K-fold data split
  • TF-IDF feature extraction, converting words into a numeric matrix
  • Model training
  • Predict on the test set in every fold, sum the results, and take the average over the K folds as the prediction matrix (a tiny worked example follows this list)
  • np.argmax to obtain the predicted labels
  • Generate the submission file
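
As a tiny worked example of the last three steps (made-up numbers, 2 test samples, 3 classes):

import numpy as np

# Hypothetical per-fold class-probability matrices (2 test samples, 3 classes)
fold_preds = [
    np.array([[0.7, 0.2, 0.1], [0.1, 0.3, 0.6]]),  # fold 1
    np.array([[0.6, 0.3, 0.1], [0.2, 0.2, 0.6]]),  # fold 2
]
cv_pred = sum(fold_preds) / len(fold_preds)  # sum, then average over the folds
result = np.argmax(cv_pred, axis=1)          # predicted class per sample -> array([0, 2])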

3 Implementation

import re

import numpy as np
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
import lightgbm as lgb
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

tqdm.pandas()  # enables the DataFrame.progress_apply calls below

# Stop-word setup (requires nltk data: nltk.download('stopwords'), nltk.download('wordnet'))
en_stop = set(nltk.corpus.stopwords.words('english'))
custom_stop_words = [
    'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure',
    'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.',
    'al.', 'elsevier', 'pmc', 'czi', 'www'
]
for word in custom_stop_words:
    en_stop.add(word)


def preprocess_text(document):
    stemmer = WordNetLemmatizer()

    document = str(document)
    document = document.replace("\n", ' ')
    document = document.replace("/'", '')
    # Remove all special characters
    document = re.sub(r'\W', ' ', document)

    # Remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove a single character at the start of the document
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Replace multiple spaces with a single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Generalize numbers: every run of two or more digits is masked with '#',
    # e.g. 123 becomes ### and 15.80 becomes ##.##
    document = re.sub('[0-9]{5,}', '#####', document)
    document = re.sub('[0-9]{4}', '####', document)
    document = re.sub('[0-9]{3}', '###', document)
    document = re.sub('[0-9]{2}', '##', document)
    # Convert to lowercase
    document = document.lower()
    # Lemmatization
    tokens = document.split()
    tokens = [stemmer.lemmatize(word) for word in tokens]
    # Remove stop words
    tokens = [word for word in tokens if word not in en_stop]
    # Drop short tokens (length <= 3)
    tokens = [word for word in tokens if len(word) > 3]
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text


# Test set: build one text field from title + abstract and clean it
test = pd.read_csv("test/test.csv", sep="\t")
test["text"] = (test["title"] + " " + test["abstract"]).progress_apply(preprocess_text)

clean_tag = True  # True: clean the raw training data; False: load the cached cleaned file
if clean_tag:
    train = pd.read_csv("train/train_stop.csv", sep="\t")
    sub = pd.read_csv("sample_submit.csv")

    train["text"] = train["title"] + " " + train["abstract"]
    train["text"] = train["text"].progress_apply(preprocess_text)
    train.to_csv('ml_clean_data.csv', sep='\t')
else:
    train = pd.read_csv('ml_clean_data.csv', sep='\t')
# Build the label id <-> category-name mappings
label_id2cate = dict(enumerate(train.categories.unique()))
label_cate2id = {value: key for key, value in label_id2cate.items()}
train["label"] = train["categories"].map(label_cate2id)
df = train[["text", "label"]]
df.head()

# Generate the submission file
def submit_file(result_pred, label_id2cate, model_name="lgb"):  # result_pred: one predicted label id per test sample (10,000 values)
    print("Saving prediction results")
    sub = pd.read_csv('./sample_submit.csv')  # submission template provided by the competition site
    sub['categories'] = list(result_pred)
    sub['categories'] = sub['categories'].map(label_id2cate)
    sub.to_csv('submit/submit_{}_ensemble.csv'.format(model_name), index=False)

# 5-fold cross-validation

params = {
    "device_type": "gpu",
    "max_depth": 5,
    "min_data_in_leaf": 20,
    "num_leaves": 35,
    "learning_rate": 0.1,
    "lambda_l1": 0.1,
    "lambda_l2": 0.2,
    "objective": "multiclass",
    "num_class": 39,
    "verbose": 0,
}

train_data = df["text"]
train_label = df["label"]

NFOLDS = 5
kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=1)
kf = kfold.split(train_data, train_label)
cv_pred = np.zeros((test.shape[0], params["num_class"]))  # accumulated class probabilities
valid_best = 0

for i, (train_fold, validate) in enumerate(kf):

    #     X=train_data.reset_index(drop=True)
    #     y= train_label.reset_index(drop=True)
    X_train, X_validate, label_train, label_validate = (
        train_data.iloc[train_fold],
        train_data.iloc[validate],
        train_label[train_fold],
        train_label[validate],
    )

    # Convert the corpus to bag-of-words vectors, then compute TF-IDF weights from them
    vectorizer = CountVectorizer(max_features=50000)
    tf_idf_transformer = TfidfTransformer()
    tf_idf = tf_idf_transformer.fit_transform(vectorizer.fit_transform(X_train))
    X_train_weight = tf_idf.toarray()  # training-set TF-IDF weight matrix
    tf_idf = tf_idf_transformer.transform(vectorizer.transform(X_validate))
    X_validate_weight = tf_idf.toarray()  # validation-set TF-IDF weight matrix
    tf_idf = tf_idf_transformer.transform(vectorizer.transform(test["text"]))
    X_test_weight = tf_idf.toarray()  # test-set TF-IDF weight matrix (vectorizer is fold-specific)

    dtrain = lgb.Dataset(X_train_weight, label_train)
    dvalid = lgb.Dataset(X_validate_weight, label_validate, reference=dtrain)

    bst = lgb.train(
        params,
        dtrain,
        num_boost_round=10000,
        valid_sets=dvalid,
        early_stopping_rounds=500,
    )

    cv_pred += bst.predict(X_test_weight, num_iteration=bst.best_iteration)
    valid_best += bst.best_score["valid_0"]["multi_logloss"]

cv_pred /= NFOLDS  # average the per-fold predictions
valid_best /= NFOLDS
result = np.argmax(cv_pred, axis=1)
submit_file(list(result), label_id2cate)
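
As a rough, untested sketch of the pseudo-label idea listed in the introduction (none of this is in the original code, and the 0.95 confidence threshold is an arbitrary assumption): keep test samples whose averaged predicted probability is very high, label them with the argmax, append them to df, and rerun the cross-validation loop above.

# Hedged sketch of pseudo-label training (not part of the original solution)
confident = cv_pred.max(axis=1) > 0.95                  # assumed confidence threshold
pseudo = pd.DataFrame({
    "text": test.loc[confident, "text"].values,
    "label": cv_pred[confident].argmax(axis=1),
})
df_pseudo = pd.concat([df, pseudo], ignore_index=True)  # df_pseudo can replace df above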

Posted: 2021-08-13 12:01:20  Updated: 2021-08-13 12:01:54
 