# Load the Philadelphia house-price data from CSV into `dat`.
# NOTE(review): this is an absolute path under one user's home directory, so
# the script only runs on that machine; consider a project-relative path.
dat <- read_csv(
  file = "/Users/mariacuellar/Github/crim_data_analysis/data/philadelphia_house_prices.csv"
)
## Rows: 1000 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): neighborhood
## dbl (7): price, sqft, bedrooms, bathrooms, distance_center, crime_rate, has_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Auto-print the data to preview the first rows and the type of each column.
dat
## # A tibble: 1,000 × 8
##    price  sqft bedrooms bathrooms distance_center crime_rate has_garage
##    <dbl> <dbl>    <dbl>     <dbl>           <dbl>      <dbl>      <dbl>
##  1  268. 1733.        3         3            2.93       6.98          1
##  2  218. 1801.        3         2            4.78       9.26          0
##  3  356. 2247.        4         3            2.51       7.52          1
##  4  242. 2228.        3         2            6.69       9.15          0
##  5  365. 2540.        4         2            4.45       8.73          1
##  6  447. 2652.        4         3            2.79       5.77          0
##  7  248. 2121.        3         2            7.26       9.56          0
##  8  289. 2180.        4         3            6.96      11             1
##  9  350. 1862.        3         3            1.73       6.73          0
## 10  274. 1874.        2         3            0.53       7.37          1
## # ℹ 990 more rows
## # ℹ 1 more variable: neighborhood <chr>

1. Do visual EDA on price, and do visual EDA on square footage. Describe what you see.

# Distribution of price: histogram with ggplot2's default binning.
ggplot(data = dat, mapping = aes(x = price)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

# Distribution of square footage: histogram with ggplot2's default binning.
ggplot(data = dat, mapping = aes(x = sqft)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

2. Could there be a categorical variable that explains the strange features observed in price? Look for it in the data.

# Price histogram with bars filled by neighborhood, to see whether the
# categorical variable accounts for the shape of the price distribution.
ggplot(data = dat, mapping = aes(x = price, fill = neighborhood)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

3. Do quantitative EDA on price and square footage, separately. Describe.

# One-row summary: mean and standard deviation of price and of square footage,
# each computed with missing values removed.
summarise(
  dat,
  mean_price = mean(price, na.rm = TRUE),
  sd_price = sd(price, na.rm = TRUE),
  mean_sqft = mean(sqft, na.rm = TRUE),
  sd_sqft = sd(sqft, na.rm = TRUE)
)
## # A tibble: 1 × 4
##   mean_price sd_price mean_sqft sd_sqft
##        <dbl>    <dbl>     <dbl>   <dbl>
## 1       289.     82.8     2000.    396.

4. Do visual EDA on price and square footage, together. Describe what you see.

# Joint view of the two variables: price on the x-axis, square footage on the
# y-axis, one point per row of `dat`.
ggplot(data = dat, mapping = aes(x = price, y = sqft)) +
  geom_point()

5. Do quantitative EDA on price and square footage, together. Describe.

# Correlation between price and square footage (cor()'s default method).
# use = "complete.obs" drops rows where either value is missing, in line with
# the na.rm = TRUE handling used for the means and standard deviations of the
# same columns; without it a single NA would make the whole result NA.
dat %>%
  summarize(correlation = cor(price, sqft, use = "complete.obs"))
## # A tibble: 1 × 1
##   correlation
##         <dbl>
## 1       0.572

6. Repeat 4, but now add a color for the categorical variable you found in 2. Describe what you see.

# Same price-vs-square-footage scatterplot, with points colored by
# neighborhood.
ggplot(
  data = dat,
  mapping = aes(x = price, y = sqft, color = neighborhood)
) +
  geom_point()

7. Fit a simple linear regression of price on square footage. What is your null hypothesis? Look at the summary. For now, don’t interpret the coefficients and don’t test the assumptions.

# Simple linear regression with price as the response and square footage as
# the single predictor; the fitted model is kept in `out`.
out <- lm(formula = price ~ sqft, data = dat)

# Print the coefficient table, residual quantiles, R-squared and F-test.
summary(object = out)
## 
## Call:
## lm(formula = price ~ sqft, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -109.93  -31.32   -8.39   24.25 1111.17 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 50.109386  11.062594    4.53 6.62e-06 ***
## sqft         0.119626   0.005426   22.05  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 67.91 on 998 degrees of freedom
## Multiple R-squared:  0.3275, Adjusted R-squared:  0.3269 
## F-statistic: 486.1 on 1 and 998 DF,  p-value: < 2.2e-16