# Load the Philadelphia house-price data (1,000 rows, 8 columns).
# Column types are declared up front so the import does not depend on readr's
# type guessing and no column-specification message is printed:
# `neighborhood` is character; the other seven columns (price, sqft, bedrooms,
# bathrooms, distance_center, crime_rate, has_garage) are double.
# NOTE(review): this is an absolute path on one machine — consider a
# project-relative path so the script can run elsewhere.
dat <- read_csv(
  "/Users/mariacuellar/Github/crim_data_analysis/data/philadelphia_house_prices.csv",
  col_types = cols(
    neighborhood = col_character(),
    .default = col_double()
  )
)
# Display the imported tibble (first rows plus column types).
print(dat)
## # A tibble: 1,000 × 8
## price sqft bedrooms bathrooms distance_center crime_rate has_garage
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 268. 1733. 3 3 2.93 6.98 1
## 2 218. 1801. 3 2 4.78 9.26 0
## 3 356. 2247. 4 3 2.51 7.52 1
## 4 242. 2228. 3 2 6.69 9.15 0
## 5 365. 2540. 4 2 4.45 8.73 1
## 6 447. 2652. 4 3 2.79 5.77 0
## 7 248. 2121. 3 2 7.26 9.56 0
## 8 289. 2180. 4 3 6.96 11 1
## 9 350. 1862. 3 3 1.73 6.73 0
## 10 274. 1874. 2 3 0.53 7.37 1
## # ℹ 990 more rows
## # ℹ 1 more variable: neighborhood <chr>
# Histograms of `price` and of `sqft`, followed by `price` again with the bars
# filled by `neighborhood`.
# `bins = 30` is stated explicitly: it is the value `stat_bin()` was already
# falling back to, so the plots are unchanged, but the
# "Pick better value `binwidth`" message is no longer emitted.
# NOTE(review): 30 is only ggplot2's default — tune `bins`/`binwidth` to the
# data if a different resolution tells the story better.
dat %>%
  ggplot(aes(x = price)) +
  geom_histogram(bins = 30)

dat %>%
  ggplot(aes(x = sqft)) +
  geom_histogram(bins = 30)

dat %>%
  ggplot(aes(x = price, fill = neighborhood)) +
  geom_histogram(bins = 30)
# Mean and standard deviation of `price` and `sqft`, ignoring missing values.
# `.names` puts the statistic first, so the result has the columns
# mean_price, sd_price, mean_sqft, sd_sqft (in that order).
dat %>%
  summarize(
    across(
      c(price, sqft),
      list(
        mean = function(x) mean(x, na.rm = TRUE),
        sd = function(x) sd(x, na.rm = TRUE)
      ),
      .names = "{.fn}_{.col}"
    )
  )
## # A tibble: 1 × 4
## mean_price sd_price mean_sqft sd_sqft
## <dbl> <dbl> <dbl> <dbl>
## 1 289. 82.8 2000. 396.
# Scatter plot of `price` against `sqft`. `price` goes on the y-axis so the
# picture matches the regression of `price` on `sqft` fitted from this data
# (response on y, predictor on x).
dat %>%
  ggplot(aes(x = sqft, y = price)) +
  geom_point()

# Correlation between `price` and `sqft`. `use = "complete.obs"` drops rows
# with a missing value in either column, consistent with the `na.rm = TRUE`
# used for the summary statistics; without it a single NA would make the
# result NA.
dat %>% summarize(correlation = cor(price, sqft, use = "complete.obs"))
## # A tibble: 1 × 1
## correlation
## <dbl>
## 1 0.572

# Same scatter plot with the points coloured by `neighborhood`.
dat %>%
  ggplot(aes(x = sqft, y = price, color = neighborhood)) +
  geom_point()
# Fit a linear model with `price` as the response and `sqft` as the only
# predictor, then print its coefficient table and fit statistics.
out <- lm(formula = price ~ sqft, data = dat)
print(summary(out))
##
## Call:
## lm(formula = price ~ sqft, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -109.93 -31.32 -8.39 24.25 1111.17
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.109386 11.062594 4.53 6.62e-06 ***
## sqft 0.119626 0.005426 22.05 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 67.91 on 998 degrees of freedom
## Multiple R-squared: 0.3275, Adjusted R-squared: 0.3269
## F-statistic: 486.1 on 1 and 998 DF, p-value: < 2.2e-16