diff --git a/Lessons/Lesson22_Basic_Stats_II_Percents.ipynb b/Lessons/Lesson22_Basic_Stats_II_Percents.ipynb index 34261c6..2f323b5 100644 --- a/Lessons/Lesson22_Basic_Stats_II_Percents.ipynb +++ b/Lessons/Lesson22_Basic_Stats_II_Percents.ipynb @@ -83,7 +83,7 @@ "id": "5ADm2TV-s7VG" }, "source": [ - "**Example 2:** Let's learn to calculate percentages by using real world data. We will work with a dataset of Boston housing prices." + "**Example 2:** Let's learn to calculate percentages by using real world data. We will work with a dataset of Ames, Iowa housing prices." ] }, { @@ -96,8 +96,9 @@ }, "outputs": [], "source": [ - "# Import the load_boston method \n", - "from sklearn.datasets import load_boston" + "# Import fetch_openml and download the Ames house-prices dataset from OpenML\n", + "from sklearn.datasets import fetch_openml\n", + "housing = fetch_openml(name=\"house_prices\", as_frame=True, parser=\"auto\")" ] }, { @@ -110,7 +111,7 @@ }, "outputs": [], "source": [ - "# Import pandas, so that we can work with the data frame version of the Boston housing data\n", + "# Import pandas, so that we can work with the data frame version of the Ames housing data\n", "import pandas as pd" ] }, @@ -125,12 +126,10 @@ }, "outputs": [], "source": [ - "# Load the dataset of housing prices in Boston, and convert to\n", + "# Load the dataset of house prices in Ames, and convert to\n", "# a data frame format so it's easier to view and process\n", - "boston = load_boston()\n", - "boston_df = pd.DataFrame(boston['data'], columns = boston['feature_names'])\n", - "boston_df['PRICE'] = boston.target\n", - "boston_df" + "ames_df = housing['frame'].copy()  # features plus the SalePrice target column (housing['data'] omits the price)\n", + "ames_df" ] }, { @@ -140,7 +139,20 @@ "id": "eyMUHGews7VZ" }, "source": [ - "CHAS is the indicator variable we used last week, where 1 indicates that the property (tract) is on the Charles River and 0 means otherwise." 
+ "The `SaleCondition` column lists the condition of the house sale:\n", + "\n", + "\n", + "* `Normal`: Normal Sale \n", + "\n", + "* `Abnorml`: Abnormal Sale - trade, foreclosure, short sale\n", + "\n", + "* `AdjLand`: Adjoining Land Purchase\n", + "\n", + "* `Alloca`: Allocation - two linked properties with separate deeds, typically condo with a garage unit\n", + "\n", + "* `Family`: Sale between family members \n", + "\n", + "* `Partial`: Home was not completed when last assessed (associated with New Homes)\n" ] }, { @@ -150,7 +162,7 @@ "id": "IMpeHBEzs7VZ" }, "source": [ - "What percentage of the tracts bound the Charles River? We'll see how to do this using the query method AND using boolean indexing." + "What percentage of the houses were sold normally? We'll see how to do this using the query method AND using boolean indexing." ] }, { @@ -200,10 +212,10 @@ }, "outputs": [], "source": [ - "# Determine the total number of tracts in the dataset\n", + "# Determine the total number of houses in the dataset\n", "\n", "\n", - "# Now calculate the percentage of tracts that bounds the Charles River.\n" + "# Now calculate the percentage of houses sold normally.\n" ] }, { @@ -226,12 +238,12 @@ "id": "kFGToww_s7Vg" }, "source": [ - "What percentage of tracts have a median price less than $10,000?" + "What percentage of houses have a price less than $200,000?" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "colab": {}, "colab_type": "code", @@ -239,10 +251,10 @@ }, "outputs": [], "source": [ - "# Determine number of tracts that cost less than $10,000\n", + "# Determine number of houses that cost less than $200,000\n", "\n", "\n", - "# Calculate the percentage of tracts that cost less than $10k.\n" + "# Calculate the percentage of houses that cost less than $200k.\n" ] }, { @@ -252,7 +264,7 @@ "id": "RLZ-k3L7s7Vq" }, "source": [ - "What percentage of tracts have a median price **between** \\$10,000 and \\$30,000?" 
+ "What percentage of houses have a price **between** \\$200,000 and \\$500,000?" ] }, { @@ -265,13 +277,13 @@ }, "outputs": [], "source": [ - "# Make an array of booleans with cost greater than $10,000 AND less than $30,000\n", + "# Make an array of booleans with cost greater than $200,000 AND less than $500,000\n", "\n", "\n", - "# Determine number of tracts that cost between $10,000 and $30,000\n", + "# Determine number of houses that cost between $200,000 and $500,000\n", "\n", "\n", - "# Calculate the percentage of tracts between $10,000 and $30,000\n" + "# Calculate the percentage of houses between $200,000 and $500,000\n" ] }, { @@ -301,7 +313,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.9.12" } }, "nbformat": 4,