This is my first uploaded file.
import torch
torch.cuda.is_available()
True
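Since CUDA is visible to PyTorch, it can help to also print which device was picked up. This is just a quick sanity check using standard `torch.cuda` calls.

```python
import torch

# Quick sanity check: report the CUDA device PyTorch detected.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))   # e.g. "NVIDIA GeForce RTX ..."
else:
    print("No CUDA device found; PyCaret will run on CPU.")
```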
from pycaret.datasets import get_data
dataset = get_data('juice')
| | Id | Purchase | WeekofPurchase | StoreID | PriceCH | PriceMM | DiscCH | DiscMM | SpecialCH | SpecialMM | LoyalCH | SalePriceMM | SalePriceCH | PriceDiff | Store7 | PctDiscMM | PctDiscCH | ListPriceDiff | STORE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | CH | 237 | 1 | 1.75 | 1.99 | 0.00 | 0.0 | 0 | 0 | 0.500000 | 1.99 | 1.75 | 0.24 | No | 0.000000 | 0.000000 | 0.24 | 1 |
| 1 | 2 | CH | 239 | 1 | 1.75 | 1.99 | 0.00 | 0.3 | 0 | 1 | 0.600000 | 1.69 | 1.75 | -0.06 | No | 0.150754 | 0.000000 | 0.24 | 1 |
| 2 | 3 | CH | 245 | 1 | 1.86 | 2.09 | 0.17 | 0.0 | 0 | 0 | 0.680000 | 2.09 | 1.69 | 0.40 | No | 0.000000 | 0.091398 | 0.23 | 1 |
| 3 | 4 | MM | 227 | 1 | 1.69 | 1.69 | 0.00 | 0.0 | 0 | 0 | 0.400000 | 1.69 | 1.69 | 0.00 | No | 0.000000 | 0.000000 | 0.00 | 1 |
| 4 | 5 | CH | 228 | 7 | 1.69 | 1.69 | 0.00 | 0.0 | 0 | 0 | 0.956535 | 1.69 | 1.69 | 0.00 | Yes | 0.000000 | 0.000000 | 0.00 | 0 |
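The last 100 rows of this dataset are scored as unseen data at the end of the notebook, so it can be convenient to split that hold-out off explicitly up front. A minimal sketch (the names `data_seen` and `data_unseen` are my own):

```python
# Sketch: reserve the last 100 rows as an unseen hold-out, mirroring the
# predict_model(final_model, data=dataset.iloc[-100:]) call at the end.
data_unseen = dataset.iloc[-100:].copy()
data_seen = dataset.iloc[:-100].copy()
print(data_seen.shape, data_unseen.shape)   # (970, 19) (100, 19)
```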
from pycaret.classification import *
setup_clf = setup(data=dataset, target='Purchase')
| | Description | Value |
|---|---|---|
| 0 | session_id | 288 |
| 1 | Target | Purchase |
| 2 | Target Type | Binary |
| 3 | Label Encoded | CH: 0, MM: 1 |
| 4 | Original Data | (1070, 19) |
| 5 | Missing Values | False |
| 6 | Numeric Features | 13 |
| 7 | Categorical Features | 5 |
| 8 | Ordinal Features | False |
| 9 | High Cardinality Features | False |
| 10 | High Cardinality Method | None |
| 11 | Transformed Train Set | (748, 17) |
| 12 | Transformed Test Set | (322, 17) |
| 13 | Shuffle Train-Test | True |
| 14 | Stratify Train-Test | False |
| 15 | Fold Generator | StratifiedKFold |
| 16 | Fold Number | 10 |
| 17 | CPU Jobs | -1 |
| 18 | Use GPU | False |
| 19 | Log Experiment | False |
| 20 | Experiment Name | clf-default-name |
| 21 | USI | ad07 |
| 22 | Imputation Type | simple |
| 23 | Iterative Imputation Iteration | None |
| 24 | Numeric Imputer | mean |
| 25 | Iterative Imputation Numeric Model | None |
| 26 | Categorical Imputer | constant |
| 27 | Iterative Imputation Categorical Model | None |
| 28 | Unknown Categoricals Handling | least_frequent |
| 29 | Normalize | False |
| 30 | Normalize Method | None |
| 31 | Transformation | False |
| 32 | Transformation Method | None |
| 33 | PCA | False |
| 34 | PCA Method | None |
| 35 | PCA Components | None |
| 36 | Ignore Low Variance | False |
| 37 | Combine Rare Levels | False |
| 38 | Rare Level Threshold | None |
| 39 | Numeric Binning | False |
| 40 | Remove Outliers | False |
| 41 | Outliers Threshold | None |
| 42 | Remove Multicollinearity | False |
| 43 | Multicollinearity Threshold | None |
| 44 | Remove Perfect Collinearity | True |
| 45 | Clustering | False |
| 46 | Clustering Iteration | None |
| 47 | Polynomial Features | False |
| 48 | Polynomial Degree | None |
| 49 | Trignometry Features | False |
| 50 | Polynomial Threshold | None |
| 51 | Group Features | False |
| 52 | Feature Selection | False |
| 53 | Feature Selection Method | classic |
| 54 | Features Selection Threshold | None |
| 55 | Feature Interaction | False |
| 56 | Feature Ratio | False |
| 57 | Interaction Threshold | None |
| 58 | Fix Imbalance | False |
| 59 | Fix Imbalance Method | SMOTE |
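`setup()` was called with defaults here, which is why most of the preprocessing flags above read False or None. If you want a fixed seed or basic preprocessing, those are passed directly to `setup()`; the sketch below uses a few common PyCaret 2.x arguments (`session_id`, `train_size`, `normalize`, `silent`).

```python
# Sketch: a more explicit setup() call; all arguments shown are standard
# PyCaret 2.x setup() parameters.
setup_clf = setup(
    data=dataset,
    target='Purchase',
    session_id=288,     # pin the seed so folds and results are reproducible
    train_size=0.7,     # the default 70/30 split shown as 748/322 above
    normalize=True,     # z-score the numeric features
    silent=True,        # skip the interactive dtype confirmation prompt
)
```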
rf = create_model('rf', fold=5)
| Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
|---|---|---|---|---|---|---|---|
| 0 | 0.8000 | 0.8649 | 0.7018 | 0.7547 | 0.7273 | 0.5697 | 0.5706 |
| 1 | 0.7933 | 0.8453 | 0.7895 | 0.7031 | 0.7438 | 0.5716 | 0.5743 |
| 2 | 0.8000 | 0.8817 | 0.7018 | 0.7547 | 0.7273 | 0.5697 | 0.5706 |
| 3 | 0.7248 | 0.8169 | 0.6429 | 0.6316 | 0.6372 | 0.4156 | 0.4156 |
| 4 | 0.8054 | 0.8380 | 0.7679 | 0.7288 | 0.7478 | 0.5895 | 0.5901 |
| Mean | 0.7847 | 0.8494 | 0.7207 | 0.7146 | 0.7167 | 0.5432 | 0.5443 |
| Std | 0.0302 | 0.0223 | 0.0524 | 0.0457 | 0.0406 | 0.0643 | 0.0647 |
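`create_model()` takes any estimator ID from PyCaret's model table; `models()` lists them. A small sketch using both functions:

```python
# Sketch: list the available estimator IDs, then train another model with the
# same 5-fold cross-validation as the 'rf' call above.
all_models = models()                  # DataFrame indexed by IDs like 'lr', 'rf', 'lightgbm'
print(all_models.index.tolist())

lightgbm = create_model('lightgbm', fold=5)
```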
top5 = compare_models(sort='Accuracy', n_select=5)
| | Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) |
|---|---|---|---|---|---|---|---|---|---|
| lr | Logistic Regression | 0.8155 | 0.8868 | 0.7277 | 0.7756 | 0.7491 | 0.6036 | 0.6061 | 0.1880 |
| ridge | Ridge Classifier | 0.8155 | 0.0000 | 0.7523 | 0.7627 | 0.7557 | 0.6077 | 0.6095 | 0.0070 |
| lda | Linear Discriminant Analysis | 0.8101 | 0.8876 | 0.7346 | 0.7596 | 0.7448 | 0.5940 | 0.5963 | 0.0060 |
| gbc | Gradient Boosting Classifier | 0.8007 | 0.8756 | 0.7203 | 0.7459 | 0.7301 | 0.5725 | 0.5755 | 0.0230 |
| ada | Ada Boost Classifier | 0.7968 | 0.8640 | 0.7060 | 0.7462 | 0.7228 | 0.5629 | 0.5661 | 0.0210 |
| rf | Random Forest Classifier | 0.7888 | 0.8553 | 0.7346 | 0.7189 | 0.7218 | 0.5523 | 0.5571 | 0.1100 |
| lightgbm | Light Gradient Boosting Machine | 0.7860 | 0.8620 | 0.7314 | 0.7149 | 0.7199 | 0.5473 | 0.5506 | 0.0160 |
| et | Extra Trees Classifier | 0.7821 | 0.8178 | 0.7347 | 0.7053 | 0.7147 | 0.5391 | 0.5447 | 0.1090 |
| dt | Decision Tree Classifier | 0.7540 | 0.7529 | 0.7031 | 0.6696 | 0.6821 | 0.4822 | 0.4863 | 0.0050 |
| nb | Naive Bayes | 0.7499 | 0.8248 | 0.7527 | 0.6475 | 0.6946 | 0.4854 | 0.4913 | 0.0060 |
| knn | K Neighbors Classifier | 0.7314 | 0.7596 | 0.5930 | 0.6646 | 0.6242 | 0.4166 | 0.4202 | 0.0280 |
| dummy | Dummy Classifier | 0.6217 | 0.5000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0050 |
| svm | SVM - Linear Kernel | 0.5240 | 0.0000 | 0.4000 | 0.1512 | 0.2194 | 0.0000 | 0.0000 | 0.0050 |
| qda | Quadratic Discriminant Analysis | 0.3717 | 0.4771 | 0.9034 | 0.3591 | 0.5110 | -0.0424 | -0.0494 | 0.0060 |
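`compare_models()` can also be limited to a subset of estimators or ranked by a different metric; `include`, `sort`, and `n_select` are standard arguments.

```python
# Sketch: compare only the tree ensembles and rank them by AUC instead of Accuracy.
best_tree = compare_models(
    include=['rf', 'et', 'gbc', 'lightgbm'],   # restrict the candidate list
    sort='AUC',
    n_select=1,                                # return only the top model
)
```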
top5
[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=1000,
multi_class='auto', n_jobs=None, penalty='l2',
random_state=288, solver='lbfgs', tol=0.0001, verbose=0,
warm_start=False),
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
max_iter=None, normalize=False, random_state=288, solver='auto',
tol=0.001),
LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
solver='svd', store_covariance=False, tol=0.0001),
GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
learning_rate=0.1, loss='deviance', max_depth=3,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_iter_no_change=None, presort='deprecated',
random_state=288, subsample=1.0, tol=0.0001,
validation_fraction=0.1, verbose=0,
warm_start=False),
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
n_estimators=50, random_state=288)]
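Each element of `top5` is a fitted scikit-learn estimator, so individual models can be inspected with `plot_model`; `'auc'` and `'confusion_matrix'` are standard plot types.

```python
# Sketch: visualize the best model from the comparison (plots render inline in a notebook).
plot_model(top5[0], plot='auc')
plot_model(top5[0], plot='confusion_matrix')
```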
tuned_top5 = [tune_model(i) for i in top5]
| Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
|---|---|---|---|---|---|---|---|
| 0 | 0.8267 | 0.9003 | 0.7586 | 0.7857 | 0.7719 | 0.6322 | 0.6325 |
| 1 | 0.8667 | 0.9067 | 0.7241 | 0.9130 | 0.8077 | 0.7077 | 0.7189 |
| 2 | 0.8533 | 0.9153 | 0.8276 | 0.8000 | 0.8136 | 0.6927 | 0.6930 |
| 3 | 0.7867 | 0.8894 | 0.7500 | 0.7000 | 0.7241 | 0.5506 | 0.5514 |
| 4 | 0.8667 | 0.9362 | 0.8929 | 0.7812 | 0.8333 | 0.7230 | 0.7275 |
| 5 | 0.8400 | 0.9236 | 0.7500 | 0.8077 | 0.7778 | 0.6530 | 0.6541 |
| 6 | 0.7467 | 0.8469 | 0.5714 | 0.6957 | 0.6275 | 0.4383 | 0.4432 |
| 7 | 0.7600 | 0.8100 | 0.6429 | 0.6923 | 0.6667 | 0.4796 | 0.4804 |
| 8 | 0.8649 | 0.8769 | 0.7500 | 0.8750 | 0.8077 | 0.7045 | 0.7094 |
| 9 | 0.7568 | 0.8688 | 0.7143 | 0.6667 | 0.6897 | 0.4900 | 0.4908 |
| Mean | 0.8168 | 0.8874 | 0.7382 | 0.7717 | 0.7520 | 0.6072 | 0.6101 |
| Std | 0.0468 | 0.0362 | 0.0839 | 0.0783 | 0.0672 | 0.1024 | 0.1035 |
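`tune_model()` was used here with its defaults (10-fold CV and a random grid search scored on Accuracy). `n_iter` and `optimize` are standard arguments if you want a wider search or a different target metric; a sketch:

```python
# Sketch: tune each selected model with a larger random search, optimizing AUC
# instead of Accuracy. n_iter and optimize are standard tune_model arguments.
tuned_top5 = [tune_model(m, n_iter=50, optimize='AUC') for m in top5]
```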
blender_top5 = blend_models(estimator_list=tuned_top5)
| Fold | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
|---|---|---|---|---|---|---|---|
| 0 | 0.8667 | 0.0000 | 0.8621 | 0.8065 | 0.8333 | 0.7224 | 0.7235 |
| 1 | 0.8267 | 0.0000 | 0.7586 | 0.7857 | 0.7719 | 0.6322 | 0.6325 |
| 2 | 0.8667 | 0.0000 | 0.8621 | 0.8065 | 0.8333 | 0.7224 | 0.7235 |
| 3 | 0.7867 | 0.0000 | 0.7857 | 0.6875 | 0.7333 | 0.5569 | 0.5603 |
| 4 | 0.8800 | 0.0000 | 0.8929 | 0.8065 | 0.8475 | 0.7490 | 0.7516 |
| 5 | 0.8533 | 0.0000 | 0.7857 | 0.8148 | 0.8000 | 0.6843 | 0.6846 |
| 6 | 0.8000 | 0.0000 | 0.7143 | 0.7407 | 0.7273 | 0.5695 | 0.5697 |
| 7 | 0.7333 | 0.0000 | 0.6786 | 0.6333 | 0.6552 | 0.4382 | 0.4389 |
| 8 | 0.8514 | 0.0000 | 0.7143 | 0.8696 | 0.7843 | 0.6726 | 0.6801 |
| 9 | 0.7568 | 0.0000 | 0.7143 | 0.6667 | 0.6897 | 0.4900 | 0.4908 |
| Mean | 0.8221 | 0.0000 | 0.7768 | 0.7618 | 0.7676 | 0.6237 | 0.6256 |
| Std | 0.0480 | 0.0000 | 0.0706 | 0.0725 | 0.0615 | 0.1005 | 0.1010 |
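`blend_models()` wraps the tuned models in a VotingClassifier; because RidgeClassifier has no `predict_proba`, the ensemble falls back to hard voting, which is consistent with the all-zero AUC column above. If a trainable combiner is preferred, `stack_models` is the PyCaret counterpart; the logistic-regression meta-learner below is just an illustrative choice.

```python
# Sketch: stack the tuned models instead of voting. stack_models and meta_model
# are standard PyCaret 2.x API; the 'lr' meta-learner is an illustrative choice.
stacker_top5 = stack_models(estimator_list=tuned_top5, meta_model=create_model('lr'))
```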
final_model = finalize_model(blender_top5)
prediction = predict_model(final_model, data=dataset.iloc[-100:])
| | Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
|---|---|---|---|---|---|---|---|---|
| 0 | Voting Classifier | 0 | 0.8000 | 0 | 0 | 0 | 0 | 0 |
prediction
| | Id | Purchase | WeekofPurchase | StoreID | PriceCH | PriceMM | DiscCH | DiscMM | SpecialCH | SpecialMM | LoyalCH | SalePriceMM | SalePriceCH | PriceDiff | Store7 | PctDiscMM | PctDiscCH | ListPriceDiff | STORE | Label |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 970 | 971 | MM | 240 | 1 | 1.75 | 1.99 | 0.0 | 0.30 | 0 | 1 | 0.224526 | 1.69 | 1.75 | -0.06 | No | 0.150754 | 0.000000 | 0.24 | 1 | MM |
| 971 | 972 | MM | 241 | 1 | 1.86 | 1.99 | 0.0 | 0.30 | 0 | 1 | 0.179621 | 1.69 | 1.86 | -0.17 | No | 0.150754 | 0.000000 | 0.13 | 1 | MM |
| 972 | 973 | CH | 242 | 1 | 1.86 | 1.99 | 0.0 | 0.30 | 0 | 1 | 0.143697 | 1.69 | 1.86 | -0.17 | No | 0.150754 | 0.000000 | 0.13 | 1 | MM |
| 973 | 974 | MM | 243 | 1 | 1.86 | 1.99 | 0.0 | 0.80 | 0 | 1 | 0.314957 | 1.19 | 1.86 | -0.67 | No | 0.402010 | 0.000000 | 0.13 | 1 | MM |
| 974 | 975 | MM | 244 | 1 | 1.86 | 2.09 | 0.0 | 0.00 | 0 | 0 | 0.251966 | 2.09 | 1.86 | 0.23 | No | 0.000000 | 0.000000 | 0.23 | 1 | MM |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1065 | 1066 | CH | 252 | 7 | 1.86 | 2.09 | 0.1 | 0.00 | 0 | 0 | 0.587822 | 2.09 | 1.76 | 0.33 | Yes | 0.000000 | 0.053763 | 0.23 | 0 | CH |
| 1066 | 1067 | CH | 256 | 7 | 1.86 | 2.18 | 0.0 | 0.00 | 0 | 0 | 0.670258 | 2.18 | 1.86 | 0.32 | Yes | 0.000000 | 0.000000 | 0.32 | 0 | CH |
| 1067 | 1068 | MM | 257 | 7 | 1.86 | 2.18 | 0.0 | 0.00 | 0 | 0 | 0.736206 | 2.18 | 1.86 | 0.32 | Yes | 0.000000 | 0.000000 | 0.32 | 0 | CH |
| 1068 | 1069 | CH | 261 | 7 | 1.86 | 2.13 | 0.0 | 0.24 | 0 | 0 | 0.588965 | 1.89 | 1.86 | 0.03 | Yes | 0.112676 | 0.000000 | 0.27 | 0 | CH |
| 1069 | 1070 | CH | 270 | 1 | 1.86 | 2.18 | 0.0 | 0.00 | 0 | 0 | 0.671172 | 2.18 | 1.86 | 0.32 | No | 0.000000 | 0.000000 | 0.32 | 1 | CH |
100 rows × 20 columns
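After `finalize_model()`, the pipeline is usually persisted so it can be reused outside the notebook; `save_model` and `load_model` are PyCaret's standard helpers (the file name `juice_blender` is my own choice).

```python
# Sketch: persist the finalized pipeline and reload it later for scoring.
save_model(final_model, 'juice_blender')            # writes juice_blender.pkl
loaded = load_model('juice_blender')
new_predictions = predict_model(loaded, data=dataset.iloc[-100:])
```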
from pycaret.utils import check_metric
check_metric(prediction['Purchase'], prediction['Label'], metric='Accuracy')
0.81
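`check_metric()` supports other metrics as well, but with the string class labels used here it can be simpler to compute them directly with scikit-learn. In the sketch below, `pos_label='MM'` is an illustrative choice that matches the MM = 1 encoding reported by `setup()`.

```python
# Sketch: compute additional metrics on the same predictions with scikit-learn.
from sklearn.metrics import f1_score, precision_score, recall_score

y_true = prediction['Purchase']
y_pred = prediction['Label']

# pos_label='MM' treats MM as the positive class, matching the MM=1 label encoding.
print('Recall   :', recall_score(y_true, y_pred, pos_label='MM'))
print('Precision:', precision_score(y_true, y_pred, pos_label='MM'))
print('F1       :', f1_score(y_true, y_pred, pos_label='MM'))
```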