pNEUMA轨迹数据处理¶
这个案例的Jupyter notebook: 点击这里.
在这个例子中,我们将展示如何将 `TransBigData` 融入到雅典pNEUMA轨迹数据集的处理与可视化中。
请注意,样本数据已经经过了一定的处理。原始版本的数据集可以在这里下载。
website
import transbigdata as tbd
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
读取数据¶
轨迹数据¶
# Load the sample trajectory table from CSV
data = pd.read_csv('data/pNEUMA_tbd_sample.csv')
# Convert the numeric 'time' column to pandas datetimes; unit='s' makes
# pandas interpret the values as seconds
data['time'] = pd.to_datetime(data['time'], unit='s')
# Preview the first rows
data.head()
track_id | lon | lat | speed | time | |
---|---|---|---|---|---|
0 | 128 | 23.730362 | 37.990046 | 12.5845 | 1970-01-01 00:00:00.000 |
1 | 128 | 23.730364 | 37.990045 | 12.4935 | 1970-01-01 00:00:00.040 |
2 | 128 | 23.730366 | 37.990045 | 12.3965 | 1970-01-01 00:00:00.080 |
3 | 128 | 23.730367 | 37.990045 | 12.2949 | 1970-01-01 00:00:00.120 |
4 | 128 | 23.730369 | 37.990044 | 12.1910 | 1970-01-01 00:00:00.160 |
# Print a summary of the DataFrame (row count, columns, dtypes, memory usage)
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581244 entries, 0 to 581243
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 track_id 581244 non-null int64
1 lon 581244 non-null float64
2 lat 581244 non-null float64
3 speed 581244 non-null float64
4 time 581244 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(3), int64(1)
memory usage: 22.2 MB
OSM路网数据获取¶
你可以直接从``data``文件夹加载道路数据,或者使用OSMNX下载路网 OSMNX
# Download the road network from OpenStreetMap through OSMnx
# OSM Graph
import osmnx as ox
# Bounding box laid out as [west lon, south lat, east lon, north lat]
bounds = [23.723577, 37.975462, 23.738471, 37.993053]
north, south, east, west = bounds[3], bounds[1], bounds[2], bounds[0]
# NOTE(review): passing north/south/east/west as four positional arguments is
# version-dependent; newer OSMnx releases appear to expect a single bbox
# tuple instead -- confirm against the installed OSMnx version.
G = ox.graph_from_bbox(north, south, east, west, network_type='drive')
# Convert the graph into node and edge GeoDataFrames
nodes, edges = ox.graph_to_gdfs(G, nodes=True, edges=True)
# Save the graph as GraphML so it can be reloaded later without downloading
filepath = "data/pNEUMA_network.graphml"
ox.save_graphml(G, filepath)
如果你无法在线下载路网,可以运行下面的代码读取已经保存好的路网数据(该代码仍需要安装OSMNX):
# Load the previously saved OSM road network from the GraphML file
import osmnx as ox
filepath = "data/pNEUMA_network.graphml"
G = ox.load_graphml(filepath)
# Convert the graph into node and edge GeoDataFrames
nodes, edges = ox.graph_to_gdfs(G, nodes=True, edges=True)
地图底图加载¶
将地图底图加载并可视化
# Visualize the basemap with tbd.plot_map: two side-by-side panels that
# differ only in the basemap style, each with the road network drawn on top.
bounds = [23.723577, 37.975462, 23.738471, 37.993053]
fig = plt.figure(1, (12, 8), dpi=100)
for panel_position, map_style in ((121, 1), (122, 5)):
    ax = plt.subplot(panel_position)
    plt.sca(ax)
    tbd.plot_map(plt, bounds, zoom=18, style=map_style)  # basemap tiles
    edges.plot(ax=ax, lw=1, color='grey')  # road network edges
    nodes.plot(ax=ax, markersize=8, color='red')  # road network nodes
    plt.axis('off')

数据清洗¶
数据稀疏化¶
数据集的采样间隔为 \(0.04\) 秒, 非常小,不便于处理。
然而,一些宏观层面的研究不需要如此高的采样频率。在这种情况下,可以使用 `tbd.traj_sparsify` 对数据进行稀疏化。
# Summary of the original data, for comparison with the sparsified result
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581244 entries, 0 to 581243
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 track_id 581244 non-null int64
1 lon 581244 non-null float64
2 lat 581244 non-null float64
3 speed 581244 non-null float64
4 time 581244 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(3), int64(1)
memory usage: 22.2 MB
# Sparsify the trajectories by subsampling
# NOTE(review): with timegap=0.4 the retained rows shown in the outputs of
# this page are 1 s apart (every 25th record) -- confirm how
# tbd.traj_sparsify interprets sub-second timegap values.
data_sparsify = tbd.traj_sparsify(data, col=['track_id', 'time', 'lon', 'lat'],timegap=0.4,method='subsample')
data_sparsify.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 23293 entries, 0 to 581229
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 track_id 23293 non-null int64
1 lon 23293 non-null float64
2 lat 23293 non-null float64
3 speed 23293 non-null float64
4 time 23293 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(3), int64(1)
memory usage: 1.1 MB
冗余数据剔除¶
车辆停止运行时位置不再变化,但仍然会产生大量GPS点。这些静止的GPS点中,除第一个和最后一个点外,其余都可以删除。
# Remove redundant records with tbd.clean_same
data_sparsify_clean = tbd.clean_same(data_sparsify, col=['track_id', 'time', 'lon', 'lat'])
data_sparsify_clean.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10674 entries, 0 to 581229
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 track_id 10674 non-null int64
1 lon 10674 non-null float64
2 lat 10674 non-null float64
3 speed 10674 non-null float64
4 time 10674 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(3), int64(1)
memory usage: 500.3 KB
# Preview the first rows of the cleaned, sparsified data
data_sparsify_clean.head()
track_id | lon | lat | speed | time | |
---|---|---|---|---|---|
0 | 128 | 23.730362 | 37.990046 | 12.5845 | 1970-01-01 00:00:00 |
25 | 128 | 23.730399 | 37.990040 | 10.6835 | 1970-01-01 00:00:01 |
50 | 128 | 23.730429 | 37.990036 | 7.8580 | 1970-01-01 00:00:02 |
75 | 128 | 23.730443 | 37.990033 | 1.2661 | 1970-01-01 00:00:03 |
1775 | 128 | 23.730443 | 37.990033 | 0.0027 | 1970-01-01 00:01:11 |
数据可视化¶
# Build a GeoDataFrame from the cleaned trajectory table: one point geometry
# per record from its lon/lat columns, tagged with CRS 4326.
gdf_data = gpd.GeoDataFrame(
    data_sparsify_clean,
    geometry=gpd.points_from_xy(
        x=data_sparsify_clean['lon'],
        y=data_sparsify_clean['lat'],
    ),
    crs=4326,
)
gdf_data.head()
track_id | lon | lat | speed | time | geometry | |
---|---|---|---|---|---|---|
0 | 128 | 23.730362 | 37.990046 | 12.5845 | 1970-01-01 00:00:00 | POINT (23.73036 37.99005) |
25 | 128 | 23.730399 | 37.990040 | 10.6835 | 1970-01-01 00:00:01 | POINT (23.73040 37.99004) |
50 | 128 | 23.730429 | 37.990036 | 7.8580 | 1970-01-01 00:00:02 | POINT (23.73043 37.99004) |
75 | 128 | 23.730443 | 37.990033 | 1.2661 | 1970-01-01 00:00:03 | POINT (23.73044 37.99003) |
1775 | 128 | 23.730443 | 37.990033 | 0.0027 | 1970-01-01 00:01:11 | POINT (23.73044 37.99003) |
# Rank vehicles by their number of records and print the 20 track_ids
# that have the most data points.
gdf_count = (
    gdf_data.groupby('track_id')['lon']
    .count()
    .rename('count')
    .sort_values(ascending=False)
    .reset_index()
)
print(gdf_count.head(20)['track_id'].tolist())
[2138, 3290, 1442, 3197, 4408, 1767, 5002, 5022, 2140, 347, 2584, 4750, 4542, 2431, 4905, 4997, 1329, 4263, 1215, 3400]
可视化车辆
# Single-panel figure: basemap, road network, then all trajectory points
fig = plt.figure(1, (6, 8), dpi=100)
ax = plt.subplot(111)
plt.sca(ax)
# map
tbd.plot_map(plt, bounds, zoom=18, style=4) # the map
edges.plot(ax=ax, lw=1, color='grey') # edges
# nodes.plot(ax=ax, markersize = 6, color='red') # nodes
# trajectory points, colored by the 'speed' column
gdf_data.plot(column='speed', ax=ax, markersize=0.5)
# hide the axes; the trailing semicolon suppresses the notebook cell output
plt.axis('off');

可视化单辆车,并显示最短路径
# select a vehicle (track_id 2138)
tmpgdf_data = gdf_data[gdf_data['track_id']==2138]
# the origin / destination location
# o_point = [tmpgdf_data.iloc[0]['lon'], tmpgdf_data.iloc[0]['lat']]
# d_point = [tmpgdf_data.iloc[-1]['lon'], tmpgdf_data.iloc[-1]['lat']]
# get the nearest node of each point on the map
tmpgdf_data = tbd.ckdnearest_point(tmpgdf_data, nodes)
# extract the o/d node: take the 'index' value matched to the first and last
# trajectory points, then look up the corresponding row labels of `nodes`
# NOTE(review): this relies on `nodes` having an 'index' column, which is not
# created anywhere in this file -- presumably tbd.ckdnearest_point adds it;
# confirm against the TransBigData implementation.
o_index, d_index = tmpgdf_data.iloc[0]['index'], tmpgdf_data.iloc[-1]['index']
o_node_id, d_node_id = list(nodes[nodes['index']==o_index].index)[0], \
list(nodes[nodes['index']==d_index].index)[0]
print(o_node_id, d_node_id)
tmpgdf_data.head()
250691723 358465943
track_id | lon | lat | speed | time | geometry_x | dist | index | y | x | street_count | highway | geometry_y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2138 | 23.735287 | 37.977435 | 42.1006 | 1970-01-01 00:01:35.560 | POINT (23.73529 37.97743) | 0.000779 | 145 | 37.978086 | 23.734859 | 4 | NaN | POINT (23.73486 37.97809) |
1 | 2138 | 23.735254 | 37.977473 | 41.8663 | 1970-01-01 00:01:36.000 | POINT (23.73525 37.97747) | 0.000729 | 145 | 37.978086 | 23.734859 | 4 | NaN | POINT (23.73486 37.97809) |
2 | 2138 | 23.735181 | 37.977558 | 39.9012 | 1970-01-01 00:01:37.000 | POINT (23.73518 37.97756) | 0.000618 | 145 | 37.978086 | 23.734859 | 4 | NaN | POINT (23.73486 37.97809) |
3 | 2138 | 23.735111 | 37.977638 | 37.7748 | 1970-01-01 00:01:38.000 | POINT (23.73511 37.97764) | 0.000514 | 145 | 37.978086 | 23.734859 | 4 | NaN | POINT (23.73486 37.97809) |
4 | 2138 | 23.735047 | 37.977712 | 33.8450 | 1970-01-01 00:01:39.000 | POINT (23.73505 37.97771) | 0.000418 | 145 | 37.978086 | 23.734859 | 4 | NaN | POINT (23.73486 37.97809) |
250691723 358465943
# Single-panel figure: basemap, road network, then one vehicle's trajectory
fig = plt.figure(1, (6, 8), dpi=100)
ax = plt.subplot(111)
plt.sca(ax)
# map
tbd.plot_map(plt, bounds, zoom=18, style=4) # the map
edges.plot(ax=ax, lw=1, color='grey') # edges
# nodes.plot(ax=ax, markersize = 6, color='red') # nodes
# trajectory of vehicle 2138, drawn in red
gdf_data[gdf_data['track_id']==2138].plot(ax=ax, markersize=5, color='red')
# hide the axes; the trailing semicolon suppresses the notebook cell output
plt.axis('off');

我们可以将轨迹数据与最短路径做比对。
# the shortest path (optional)
# Compute the shortest route (weighted by edge length) between the origin and
# destination nodes, then draw it on the road graph for comparison with the
# observed trajectory.
route = ox.shortest_path(G, o_node_id, d_node_id, weight="length")
# ox.plot_graph_route returns (figure, axes). Bind the figure to `fig` rather
# than `plt`, so the matplotlib.pyplot alias `plt` is not overwritten and
# later plt.* calls keep working.
fig, ax = ox.plot_graph_route(G, route, route_color="green", route_linewidth=8, node_size=0)
