# SPDS
library(tidyverse)
library(sf)
library(units)
For future area calculations, I transformed the projection to an equal area projection.
counties = USAboundaries::us_counties() %>%
filter(!(state_name %in% c("Puerto Rico", "Alaska", "Hawaii"))) %>%
st_transform(5070)
These are needed for triangle based tesselations to serve as “anchors”.
#Get centroids
county_cent = st_centroid(counties) %>%
st_combine()
Tessellations/Coverages describe the extent of a region with geometric shapes, called tiles, with no overlaps or gaps. Tiles can range in size, shape, area and have different methods for being created. Some methods generate triangular tiles across a set of defined points (e.g. voroni and delauny triangulation) Others generate equal area tiles over a known extent (st_make_grid)
For this project, I will create surfaces of CONUS using using 4 methods, 2 based on an extent and 2 based on point anchors:
Tessellations:
Coverages:
#Make Voronoi polygons
v_grid = st_voronoi(county_cent) %>%
st_cast() %>%
st_as_sf() %>%
mutate(id = 1:n())
plot(v_grid)
#Make Delaunay triangulation
t_grid = st_triangulate(county_cent) %>%
st_cast() %>%
st_as_sf() %>%
mutate(id = 1:n())
plot(t_grid)
#Make gridded coverage
sq_grid = st_make_grid(county_cent, n = c(70, 70)) %>%
st_as_sf() %>%
mutate(id = 1:n())
#Make hexagonal coverage
hex_grid = st_make_grid(county_cent, n = c(70, 70), square = FALSE) %>%
st_as_sf() %>%
mutate(id = 1:n())
As shown by the plots of the above tessellations, you’ll see the triangulated surfaces produce regions far beyond the boundaries of CONUS. We need to cut these boundaries to CONUS border.
To do this, I used st_intersection, but will first need a geometry of CONUS to serve as our differencing feature. We can get this by unioning our existing county boundaries.
#Simplify feature boundary
simp_counties = st_union(counties)
The more points our geometry contains, the more computations needed for spatial predicates our differencing. For a task like this, we do not need a finely resolved coastal boarder. I simplified the unioned border using the Visvalingam algotithm provided by rmapshaper::ms_simplify, and used this border to crop the two triangulated tesselations.
simp_counties = simp_counties %>%
rmapshaper::ms_simplify(keep = 0.1)
Here, I use mapview::npts function to report the number of points in the original object, and the number of points in the simplified object.
#Number of points in original object:
mapview::npts(st_union(counties))
## [1] 3229
#Number of points in simplified object:
mapview::npts(simp_counties)
## [1] 322
#Difference in points
paste("The difference in points between the original object and the simplified object is", mapview::npts(st_union(counties)) - mapview::npts(simp_counties))
## [1] "The difference in points between the original object and the simplified object is 2907"
#Crop triangulated tesselations
v_grid = st_intersection(v_grid, simp_counties)
plot(v_grid)
t_grid = st_intersection(t_grid, simp_counties)
plot(t_grid)
*****
Making a function to plot the tesselations/coverages allows the code to be written only once and then re-used, which reduces time required to code and the chances of making mistakes in the code.
#Ploting tesselations function
plot_tess = function(data, title){
ggplot() +
geom_sf(data = data, fill = "white", col = "navy", size = .2) +
theme_void() +
labs(title = title, caption = paste("This tesselation has:", nrow(data), "tiles" )) +
theme(plot.title = element_text(hjust = .5, color = "navy", face = "bold"))
}
#Make ggplots
plot_tess(counties, "Original County Data")
plot_tess(v_grid, "Voronoi Coverage")
plot_tess(t_grid, "Triangulation Coverage")
plot_tess(sq_grid, "Square Grid Coverage")
plot_tess(hex_grid, "Hexagonal Grid Coverage")
*****
sum_tess = function(data, title){
newdata = data %>%
mutate(number_of_features = nrow(data)) %>%
mutate(mean_area = mean(drop_units(set_units(st_area(data), "km2")))) %>%
mutate(sd_area = sd(drop_units(set_units(st_area(data), "km2")))) %>%
mutate(total_area = mean_area * number_of_features)
newdata$tesselation <- title
newdata = newdata %>%
select(tesselation, number_of_features, mean_area, sd_area, total_area) %>%
st_drop_geometry() %>%
head(1)
return(newdata)
}
sum_tess(counties, "Original Counties")
## tesselation number_of_features mean_area sd_area total_area
## 1 Original Counties 3108 2521.745 3404.325 7837583
sum_tess(v_grid, "Voroni")
## tesselation number_of_features mean_area sd_area total_area
## 1 Voroni 3107 2522.865 2885.827 7838541
sum_tess(t_grid, "Triangulation")
## tesselation number_of_features mean_area sd_area total_area
## 1 Triangulation 6196 1252.506 1576.11 7760528
sum_tess(sq_grid, "Square Grid")
## tesselation number_of_features mean_area sd_area total_area
## 1 Square Grid 2013 2558.283 1.30149e-11 5149823
sum_tess(hex_grid, "Hexagonal Grid")
## tesselation number_of_features mean_area sd_area total_area
## 1 Hexagonal Grid 1639 3586.119 1.87566e-11 5877648
Multiple data.frame objects can bound row-wise with bind_rows into a single data.frame.
tess_summary = bind_rows(
sum_tess(counties, "Original Counties"),
sum_tess(v_grid, "Voroni"),
sum_tess(t_grid, "Triangulation"),
sum_tess(sq_grid, "Square Grid"),
sum_tess(hex_grid, "Hexagonal Grid"))
tess_summary %>%
knitr::kable(caption = "Tesselation Coverage of the US",
col.names = c("tesselation", "number_of_features", "mean_area", "sd_area", "total_area"),
format.args = list(big.mark = ",")) %>%
kableExtra::kable_styling("bordered", full_width = F, font_size = 14)
tesselation | number_of_features | mean_area | sd_area | total_area |
---|---|---|---|---|
Original Counties | 3,108 | 2,521.745 | 3,404.325 | 7,837,583 |
Voroni | 3,107 | 2,522.865 | 2,885.827 | 7,838,541 |
Triangulation | 6,196 | 1,252.506 | 1,576.110 | 7,760,528 |
Square Grid | 2,013 | 2,558.283 | 0.000 | 5,149,823 |
Hexagonal Grid | 1,639 | 3,586.119 | 0.000 | 5,877,648 |
The original county data contains about 3,100 features. The Voronoi tesselation has a similar number, and therefore similar values for mean, standard deviation, and total area. The Delaunay triagnulation tesselation however, has almost double the number of features, and therefore a smaller mean area per feature. Given the area of the features of this tesselation is generally smaller, the counts for a point-in-polygon analysis would be smaller. The square and hexagonal grid coverages both have a smaller number of features than the original county coverage, although the mean area of the square grid coverage is similar. This is because both of these coverages have a much smaller total area, which would suggest that they lose some area of the country at the edges of the country. However, looking at the plots from the first question, it appears that there’s some missing coverage in the midwest in these tesselations, which will impact the results of a point-in-polygon analysis because there may be points that would have fallen in those areas that will be missing.
The data I am going to analyze in this lab is from US Army Corp of Engineers National Dam Inventory (NID). This dataset documents ~91,000 dams in the United States and a variety of attribute information including design specifications, risk level, age, and purpose.
For the remainder of this lab we will analysis the distributions of these dams (part 3) and their purpose (part 4) through using a point-in-polygon analysis.
library(readxl)
dams <- read_excel("../data/NID2019_U.xlsx")
dams = dams %>%
filter(!is.na(LONGITUDE), !is.na(LATITUDE)) %>%
st_as_sf(coords = c("LONGITUDE", "LATITUDE"), crs = 4326) %>%
st_transform(5070)
#Transform to same projection that has been used throughout the lab
point_in_polygon = function(points, polygon, group){
st_join(polygon, points) %>%
st_drop_geometry() %>%
count(get(group)) %>%
setNames(c(group, "n")) %>%
left_join(polygon, by = group) %>%
st_as_sf()
}
counties = point_in_polygon(dams, counties, "geoid")
v_grid = point_in_polygon(dams, v_grid, "id")
t_grid = point_in_polygon(dams, t_grid, "id")
sq_grid = point_in_polygon(dams, sq_grid, "id")
hex_grid = point_in_polygon(dams, hex_grid, "id")
plot_point_tess = function(data, title){
ggplot() +
geom_sf(data = data, aes(fill = n), col = NA, size = .2) +
scale_fill_viridis_c("Number of Dams")+
theme_void() +
labs(title = title,
caption = paste0(sum(data$n), " dams represented")) +
theme(plot.title = element_text(hjust = .5, color = "navy", face = "bold"))
}
plot_point_tess(counties, "Dam Densities across Counties")
plot_point_tess(v_grid, "Dam Densities across a Voronoi Tesselation")
plot_point_tess(t_grid, "Dam Densities across a Triangulated Tesselation")
plot_point_tess(sq_grid, "Dam Densities across a Square Grid")
plot_point_tess(hex_grid, "Dam Densities across a Hexagonal Grid")
*****
I chose to use the Voronoi tesselation moving forwards for the rest of this lab because the Delaunay Triangulation tesselation creates almost double the number of features than the other tesselations, as shown in part 2.4. At a scale as large as the US, it beceomes difficult to see the differences in such small features. In the Voronoi tesselation, having fewer features allows for a more obvious determination of where the high densities of dams are in the country. I didn’t chose the square or hexagonal grid tesselations because the coverage was very spotty in the Midwest.
The NID provides a comprehensive data dictionary here. In it we find that dam purposes are designated by a character code.
In the data dictionary, we see a dam can have multiple purposes. In these cases, the purpose codes are concatenated in order of decreasing importance. For example, SCR would indicate the primary purposes are Water Supply, then Flood Control, then Recreation.
A standard summary indicates there are over 400 unique combinations of dam purposes:
unique(dams$PURPOSES) %>% length
## [1] 495
I chose to analyze Recreation (R), Flood Control (C), Fire Protection (P), and Water Supply (S) because they are the 4 most common purposes of dams in the US (as illustrated in the graph above).
r_dams = dams %>%
filter(grepl("R", dams$PURPOSES) == TRUE)
c_dams = dams %>%
filter(grepl("C", dams$PURPOSES) == TRUE)
p_dams = dams %>%
filter(grepl("P", dams$PURPOSES) == TRUE)
s_dams = dams %>%
filter(grepl("S", dams$PURPOSES) == TRUE)
Use gghighlight to highlight those tiles where the count (n) is greater then the mean + 1 standard deviation of the set. The result of this exploration is to highlight the areas of the country with the most
#drop previous n column from v_grid
v_grid = v_grid %>%
select(id, x)
point_in_polygon(r_dams, v_grid, "id") %>%
plot_point_tess("Dams used for Recreation")+
gghighlight::gghighlight(n > (mean(n)+ sd(n)))
point_in_polygon(c_dams, v_grid, "id") %>%
plot_point_tess("Dams used for Flood Control")+
gghighlight::gghighlight(n > (mean(n)+ sd(n)))
point_in_polygon(p_dams, v_grid, "id") %>%
plot_point_tess("Dams used for Fire Protection")+
gghighlight::gghighlight(n > (mean(n)+ sd(n)))
point_in_polygon(s_dams, v_grid, "id") %>%
plot_point_tess("Dams used for Water Supply")+
gghighlight::gghighlight(n > (mean(n)+ sd(n)))
*****
By subsetting the dam data, patterns can become clear not only of where dams are most prevalent, but what types of dams are most prevalent in certain areas. Dams used for recreation appear to be more common in the Eastern states, while dams used for fire protection are more concentrated in the middle and north of the country, and more dams are used for water supply in the west. The prevalence of dams used for water supply in the west makes sense given the dry climate of California in many places. The number of dams used for flood control appears to be particularly high around the Mississippi river system, which also makes sense.
The choice of a Voronoi tesselation may have impacted these results. Had the Delaunay triangulation tesselation been used, the distribution of features with numbers of dams above the mean + 1 standard deviation of the set may have been impacted because the Delaunay triangulation has so many more features.