{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Regularized Linear Models" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Imports" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib\n", "\n", "import matplotlib.pyplot as plt\n", "from scipy.stats import skew\n", "from scipy.stats import pearsonr\n", "\n", "\n", "%config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Download the train/test CSV files into ./tmp/regularized-linear-models/ (relative to the\n", "# current working directory) so that the next cell can read them with pandas.\n", "import os\n", "import requests\n", "import zipfile\n", "\n", "train_url = \"https://static-1300131294.cos.ap-shanghai.myqcloud.com/data/ml-advanced/model-selection/Regularized-Linear-Models/train.csv\"\n", "test_url = \"https://static-1300131294.cos.ap-shanghai.myqcloud.com/data/ml-advanced/model-selection/Regularized-Linear-Models/test.csv\"\n", "\n", "# Seconds to wait on the server before giving up; without a timeout a stalled\n", "# connection would block the kernel forever.\n", "REQUEST_TIMEOUT = 30\n", "\n", "notebook_path = os.getcwd()\n", "tmp_folder_path = os.path.join(notebook_path, \"tmp\")\n", "file_path = os.path.join(tmp_folder_path, \"regularized-linear-models\")\n", "zip_store_path = os.path.join(file_path, \"zip-store\")\n", "\n", "# zip_store_path is the deepest folder, so creating it also creates tmp_folder_path\n", "# and file_path; exist_ok=True keeps the cell safe to re-run.\n", "os.makedirs(zip_store_path, exist_ok=True)\n", "\n", "train_response = requests.get(train_url, timeout=REQUEST_TIMEOUT)\n", "test_response = requests.get(test_url, timeout=REQUEST_TIMEOUT)\n", "\n", "# Stop here on an HTTP error (4xx/5xx) instead of saving an error page as a CSV file,\n", "# which would only fail later, and confusingly, inside pd.read_csv.\n", "train_response.raise_for_status()\n", "test_response.raise_for_status()\n", "\n", "train_name = os.path.basename(train_url)\n", "test_name = os.path.basename(test_url)\n", "\n", "train_save_path = os.path.join(file_path, train_name)\n", "test_save_path = os.path.join(file_path, test_name)\n", "\n", "with open(train_save_path, \"wb\") as file:\n", "    file.write(train_response.content)\n", "\n", 
"with open(test_save_path, \"wb\") as file:\n", "    file.write(test_response.content)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "train = pd.read_csv(\"./tmp/regularized-linear-models/train.csv\")\n", "test = pd.read_csv(\"./tmp/regularized-linear-models/test.csv\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Id | \n", "MSSubClass | \n", "MSZoning | \n", "LotFrontage | \n", "LotArea | \n", "Street | \n", "Alley | \n", "LotShape | \n", "LandContour | \n", "Utilities | \n", "... | \n", "PoolArea | \n", "PoolQC | \n", "Fence | \n", "MiscFeature | \n", "MiscVal | \n", "MoSold | \n", "YrSold | \n", "SaleType | \n", "SaleCondition | \n", "SalePrice | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "60 | \n", "RL | \n", "65.0 | \n", "8450 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Lvl | \n", "AllPub | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "2 | \n", "2008 | \n", "WD | \n", "Normal | \n", "208500 | \n", "
1 | \n", "2 | \n", "20 | \n", "RL | \n", "80.0 | \n", "9600 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Lvl | \n", "AllPub | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "5 | \n", "2007 | \n", "WD | \n", "Normal | \n", "181500 | \n", "
2 | \n", "3 | \n", "60 | \n", "RL | \n", "68.0 | \n", "11250 | \n", "Pave | \n", "NaN | \n", "IR1 | \n", "Lvl | \n", "AllPub | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "9 | \n", "2008 | \n", "WD | \n", "Normal | \n", "223500 | \n", "
3 | \n", "4 | \n", "70 | \n", "RL | \n", "60.0 | \n", "9550 | \n", "Pave | \n", "NaN | \n", "IR1 | \n", "Lvl | \n", "AllPub | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "2 | \n", "2006 | \n", "WD | \n", "Abnorml | \n", "140000 | \n", "
4 | \n", "5 | \n", "60 | \n", "RL | \n", "84.0 | \n", "14260 | \n", "Pave | \n", "NaN | \n", "IR1 | \n", "Lvl | \n", "AllPub | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "12 | \n", "2008 | \n", "WD | \n", "Normal | \n", "250000 | \n", "
5 rows × 81 columns
\n", "XGBRegressor(base_score=None, booster=None, callbacks=None,\n", " colsample_bylevel=None, colsample_bynode=None,\n", " colsample_bytree=None, early_stopping_rounds=None,\n", " enable_categorical=False, eval_metric=None, feature_types=None,\n", " gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n", " interaction_constraints=None, learning_rate=0.1, max_bin=None,\n", " max_cat_threshold=None, max_cat_to_onehot=None,\n", " max_delta_step=None, max_depth=2, max_leaves=None,\n", " min_child_weight=None, missing=nan, monotone_constraints=None,\n", " n_estimators=360, n_jobs=None, num_parallel_tree=None,\n", " predictor=None, random_state=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
XGBRegressor(base_score=None, booster=None, callbacks=None,\n", " colsample_bylevel=None, colsample_bynode=None,\n", " colsample_bytree=None, early_stopping_rounds=None,\n", " enable_categorical=False, eval_metric=None, feature_types=None,\n", " gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n", " interaction_constraints=None, learning_rate=0.1, max_bin=None,\n", " max_cat_threshold=None, max_cat_to_onehot=None,\n", " max_delta_step=None, max_depth=2, max_leaves=None,\n", " min_child_weight=None, missing=nan, monotone_constraints=None,\n", " n_estimators=360, n_jobs=None, num_parallel_tree=None,\n", " predictor=None, random_state=None, ...)