虽然我已经过滤掉了大部分的异常值/GPS误差,但仍有一些残留。下面附上了数据的图以及相应的代码。正如你在第一张图片中所看到的,在纬度(Lat)37附近,有一些随机的线从图中射出。我不知道怎样才能进一步删除这些GPS错误,而不必逐个检查每个数据点(超过6万个)并手动删除它们。我还冒昧地提供了第二张带有高度的散点图。谢谢!
# Clean the TUVU tracking table: drop fixes without coordinates, remove
# altitude outliers with the 1.5 * IQR rule, save the cleaned tables and plot
# what is left.

# Install the add-on packages only when they are missing, so that re-running
# the script does not reinstall them every time.
for (pkg in c("ggstatsplot", "scatterplot3d", "plotly")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

library(tidyverse)
library(ggstatsplot)
library(scatterplot3d)
library(plotly)
library(dplyr)
library(ggplot2)

# TUVU.Lead_0412_2022 is not created in this script; it must already exist in
# the workspace.
df <- TUVU.Lead_0412_2022
glimpse(df)

# Count the fixes with a missing longitude (GPS error) instead of printing one
# logical value per row.
sum(is.na(df$location.long))

# Omit rows with missing geographical coordinates. Only the coordinate columns
# are checked: na.omit(df) would also drop every row that is NA in any other,
# unrelated column.
new_df <- drop_na(df, location.long, location.lat)
view(new_df)
write.csv(new_df, "Modified_TUVU_data.csv")

# Boxplot of altitude; `$out` lists the values the boxplot flags as outliers
boxplot(new_df$argos.altitude)$out

# Find outliers with the 1.5 * IQR rule. na.rm = TRUE because altitude may
# still contain NA after filtering on coordinates only.
Q <- quantile(new_df$argos.altitude, probs = c(0.25, 0.75), na.rm = TRUE)
iqr <- IQR(new_df$argos.altitude, na.rm = TRUE)
up <- Q[[2]] + 1.5 * iqr   # upper fence: Q3 + 1.5 * IQR
low <- Q[[1]] - 1.5 * iqr  # lower fence: Q1 - 1.5 * IQR (Q[1], not Q[2])

# Eliminate outliers: keep rows whose altitude lies strictly between the
# fences. subset() also drops rows where the condition is NA.
eliminated <- subset(new_df, argos.altitude > low & argos.altitude < up)

# Visualize any remaining altitude outliers (the altitude column only; a
# boxplot of the whole data frame mixes unrelated columns)
boxplot(eliminated$argos.altitude)$out

# Additional manual cut-off (3157) chosen by inspecting the boxplot above
new_new_df <- subset(eliminated, argos.altitude < 3157)

# Final view of the boxplot without outliers
boxplot(new_new_df$argos.altitude)$out
write.csv(new_new_df, "New_Modified_TUVU_data.csv")

# The plots below use the cleaned table under this name; define it explicitly
# so the script runs from top to bottom in a fresh session.
New_Modified_TUVU_data <- new_new_df

# 3D scatterplot of position and altitude. with() exposes the columns for
# this one call without attach(), which would leave them on the search path.
with(
  New_Modified_TUVU_data,
  scatterplot3d(location.long, location.lat, argos.altitude,
    highlight.3d = TRUE, angle = -100,
    type = "h", main = "3D Scatterplot Example"
  )
)

# Inspect the lowest and the highest altitude records
head(arrange(New_Modified_TUVU_data, argos.altitude))
head(arrange(New_Modified_TUVU_data, desc(argos.altitude)))

# NOTE: the former persp() call was removed. persp() needs increasing x/y grid
# vectors and a matrix of z values, so it cannot draw scattered GPS fixes; the
# scatterplot3d() call above already gives the 3D view.

# Track drawn in row order; each segment joins two consecutive fixes
ggplot(New_Modified_TUVU_data, aes(location.lat, location.long)) +
  geom_path()


发布于 2022-07-26 09:21:42
首先,你需要定义哪个GPS点是一个离群点,然后你应该考虑如何消除这样的点。例如,假设距离最近的其他点超过10米的点是一个离群点。因此,可以使用DBSCAN算法来识别这些点。下面是一个小例子:
library(ggplot2)

# Small sample of GPS fixes (decimal degrees) used to demonstrate the method
df_lonlat <- data.frame(
  latitude = c(
    57.034491, 57.034084, 57.034043, 57.034014, 57.034230,
    57.034177, 57.034124, 57.034224, 57.034234, 57.034432
  ),
  longitude = c(
    24.151452, 24.152116, 24.152202, 24.152352, 24.153221,
    24.153296, 24.153350, 24.153460, 24.153370, 24.153380
  )
)

# Build a spatial object in lon/lat (EPSG:4326, https://epsg.io/4326) and
# re-project it to a Cartesian x/y system (EPSG:3059, https://epsg.io/3059).
# DBSCAN cannot be used on raw long/lat numbers; for other countries the
# target coordinate reference system (crs) will be different.
sf_lonlat <- sf::st_transform(
  sf::st_as_sf(df_lonlat, coords = c("longitude", "latitude"), crs = 4326),
  crs = 3059
)

# Run DBSCAN on the projected coordinates to find clusters and outliers.
# eps = 10 is a 10 meter distance: points farther apart end up in a different
# cluster or are classified as outliers. minPts = 2 is used only because the
# sample has just 10 rows.
projected_xy <- sf::st_coordinates(sf_lonlat)
dbscan_result <- dbscan::dbscan(projected_xy, eps = 10, minPts = 2)

# Attach the cluster labels to the original table; cluster 0 marks the
# outliers, all other cluster numbers can be ignored.
df_lonlat[["cluster"]] <- dbscan_result[["cluster"]]

# Plot the points coloured by cluster label
ggplot(
  data = df_lonlat,
  aes(x = latitude, y = longitude, color = as.factor(cluster))
) +
  geom_point() +
  labs(
    title = "Long/Lat coordinates and outliers",
    x = "Latitude",
    y = "Longitude",
    color = "Cluster"
  ) +
  theme_light()

我没有原始数据(您将进行大量调整),但我希望您理解这段代码背后的逻辑。上面的代码是用R version 4.1.0构建的,下面是R包版本:sf (1.0.5), dbscan (1.1.8), ggplot2 (3.3.5)。
https://stackoverflow.com/questions/73119538
复制相似问题