目标跟踪实战：SORT、DeepSORT 与 ByteTrack 原理实现-程序员充电站

目标跟踪实战：SORT、DeepSORT 与 ByteTrack 原理实现

1. 引言

目标跟踪（Multi-Object Tracking, MOT）是计算机视觉的核心任务之一。在自动驾驶、视频监控、运动分析等场景中，需要在连续帧中维持每个目标的唯一身份。

核心挑战：检测器给出每帧的目标框，但不知道"第1帧的车A"和"第2帧的车A"是同一辆车。

技术演进：

SORT (2016，卡尔曼滤波) → DeepSORT (2017，+外观特征) → ByteTrack (2021，低分检测利用) → BoT-SORT (2023，+相机补偿)

2. SORT（Simple Online and Realtime Tracking）

2.1 核心流程

输入：

- 帧 t 的检测结果：D_t = {d_1, d_2, ...}
- 帧 t-1 的跟踪轨迹：T_{t-1} = {t_1, t_2, ...}

每一帧的处理步骤：

1. 预测：用卡尔曼滤波预测每个轨迹在帧 t 的位置
2. 匹配：用匈牙利算法将预测框与检测框匹配
3. 更新：匹配成功的轨迹用检测框更新
4. 创建：未匹配的检测创建新轨迹
5. 删除：长时间未匹配的轨迹删除

2.2 卡尔曼滤波

import numpy as np
from filterpy.kalman import KalmanFilter


class KalmanBoxTracker:
    """Track one bounding box with a constant-velocity Kalman filter.

    The filter state is ``[cx, cy, s, r, vx, vy, vs]`` where ``cx, cy`` is the
    box centre, ``s`` the box area, ``r`` the width/height ratio and
    ``vx, vy, vs`` the per-frame velocities of ``cx, cy, s``. The aspect ratio
    has no velocity term. Measurements are ``[cx, cy, s, r]``.
    """

    # Class-wide counter used to hand out unique track ids.
    count = 0

    def __init__(self, bbox):
        """Create a tracker initialised from a detection.

        Args:
            bbox: ``[x1, y1, x2, y2]`` corner-format box; it is converted to
                ``[cx, cy, s, r]`` before being written into the state.
        """
        self.kf = KalmanFilter(dim_x=7, dim_z=4)

        # State transition matrix F: position/area advance by their velocity.
        self.kf.F = np.array([
            [1, 0, 0, 0, 1, 0, 0],
            [0, 1, 0, 0, 0, 1, 0],
            [0, 0, 1, 0, 0, 0, 1],
            [0, 0, 0, 1, 0, 0, 0],
            [0, 0, 0, 0, 1, 0, 0],
            [0, 0, 0, 0, 0, 1, 0],
            [0, 0, 0, 0, 0, 0, 1],
        ])

        # Observation matrix H: only [cx, cy, s, r] are measured.
        self.kf.H = np.array([
            [1, 0, 0, 0, 0, 0, 0],
            [0, 1, 0, 0, 0, 0, 0],
            [0, 0, 1, 0, 0, 0, 0],
            [0, 0, 0, 1, 0, 0, 0],
        ])

        # Noise tuning: scale the measurement noise of s/r up, give the
        # unobserved velocities a large initial covariance, and scale the
        # process noise of the velocity terms down.
        self.kf.R[2:, 2:] *= 10.
        self.kf.P[4:, 4:] *= 1000.
        self.kf.P *= 10.
        self.kf.Q[-1, -1] *= 0.01
        self.kf.Q[4:, 4:] *= 0.01

        # Initial state: measured part from the detection, velocities stay 0.
        self.kf.x[:4] = self._bbox_to_z(bbox)

        self.time_since_update = 0
        self.id = KalmanBoxTracker.count
        KalmanBoxTracker.count += 1
        self.history = []
        self.hits = 0
        self.hit_streak = 0
        self.age = 0

    def _bbox_to_z(self, bbox):
        """Convert ``[x1, y1, x2, y2]`` to a ``(4, 1)`` column ``[cx, cy, s, r]``."""
        w = bbox[2] - bbox[0]
        h = bbox[3] - bbox[1]
        cx = bbox[0] + w / 2.
        cy = bbox[1] + h / 2.
        s = w * h
        # Clamp the height so a degenerate (zero-height) box cannot divide by
        # zero when computing the aspect ratio.
        r = w / max(float(h), 1e-6)
        return np.array([[cx], [cy], [s], [r]])

    def _z_to_bbox(self, z):
        """Convert ``[cx, cy, s, r, ...]`` back to a ``(1, 4)`` ``[x1, y1, x2, y2]``."""
        # s = w * h and r = w / h, hence w = sqrt(s * r) and h = s / w.
        w = np.sqrt(z[2] * z[3])
        h = z[2] / w
        return np.array([
            z[0] - w / 2.,
            z[1] - h / 2.,
            z[0] + w / 2.,
            z[1] + h / 2.,
        ]).reshape((1, 4))

    def predict(self):
        """Advance the filter one frame and return the predicted ``(1, 4)`` box."""
        # If the area velocity would push the area to zero or below, freeze it;
        # otherwise _z_to_bbox takes the square root of a negative number and
        # the predicted box becomes NaN.
        if (self.kf.x[6] + self.kf.x[2]) <= 0:
            self.kf.x[6] *= 0.0
        self.kf.predict()
        self.age += 1
        # A frame was already missed before this prediction: the streak of
        # consecutive matches is broken.
        if self.time_since_update > 0:
            self.hit_streak = 0
        self.time_since_update += 1
        return self._z_to_bbox(self.kf.x)

    def update(self, bbox):
        """Correct the filter with a matched detection ``[x1, y1, x2, y2]``."""
        self.time_since_update = 0
        self.hits += 1
        self.hit_streak += 1
        self.kf.update(self._bbox_to_z(bbox))

    def get_state(self):
        """Return the current state estimate as a ``(1, 4)`` ``[x1, y1, x2, y2]``."""
        return self._z_to_bbox(self.kf.x)

2.3 SORT 主流程

from scipy.optimize import linear_sum_assignment


class SORT:
    """Simple Online and Realtime Tracking: Kalman prediction + IoU matching.

    Args:
        max_age: a track is dropped once it has gone more than this many
            frames without a matched detection.
        min_hits: number of matched detections a track needs before it is
            reported.
        iou_threshold: minimum IoU for an assignment to count as a match.
    """

    def __init__(self, max_age=5, min_hits=3, iou_threshold=0.3):
        self.max_age = max_age
        self.min_hits = min_hits
        self.iou_threshold = iou_threshold
        self.trackers = []
        self.frame_count = 0

    def update(self, detections):
        """Process the detections of one frame.

        Args:
            detections: ``(N, 5)`` array-like of ``[x1, y1, x2, y2, score]``.
                An empty input of any shape is accepted.

        Returns:
            ``(M, 5)`` array of ``[x1, y1, x2, y2, track_id]`` for tracks that
            were matched in this frame and have at least ``min_hits`` matches.
        """
        self.frame_count += 1

        detections = np.asarray(detections)
        if detections.size == 0:
            # Normalise "no detections" (e.g. np.array([]) or []) to 2-D so
            # the column slicing below is always valid.
            detections = np.empty((0, 5))

        # Predict existing tracks. Tracks whose prediction is not finite are
        # dropped: the assignment solver rejects NaN/inf entries.
        predicted = []
        alive = []
        for trk in self.trackers:
            box = np.asarray(trk.predict(), dtype=float).reshape(-1)[:4]
            if np.all(np.isfinite(box)):
                predicted.append(box)
                alive.append(trk)
        self.trackers = alive
        predicted = np.array(predicted).reshape(-1, 4)

        # Match: IoU matrix + Hungarian algorithm, then reject low-IoU pairs.
        matched = []
        unmatched_dets = set(range(len(detections)))
        if len(predicted) > 0 and len(detections) > 0:
            iou_matrix = self._iou_batch(detections[:, :4], predicted)
            # Negate because the solver minimises cost and we maximise IoU.
            row_idx, col_idx = linear_sum_assignment(-iou_matrix)
            for d, t in zip(row_idx, col_idx):
                if iou_matrix[d, t] >= self.iou_threshold:
                    matched.append((d, t))
                    unmatched_dets.discard(d)

        # Update matched tracks with their detection.
        for d, t in matched:
            self.trackers[t].update(detections[d, :4])

        # Every unmatched detection starts a new track.
        for d in sorted(unmatched_dets):
            self.trackers.append(KalmanBoxTracker(detections[d, :4]))

        # Drop tracks that have been unmatched for too long.
        self.trackers = [
            t for t in self.trackers if t.time_since_update <= self.max_age
        ]

        # Report confirmed tracks only if they were matched in this frame;
        # otherwise a lost target would keep being emitted with a purely
        # predicted box until it ages out.
        results = [
            [*trk.get_state()[0], trk.id]
            for trk in self.trackers
            if trk.time_since_update == 0 and trk.hits >= self.min_hits
        ]
        return np.array(results) if results else np.empty((0, 5))

    def _iou_batch(self, bb_dets, bb_trks):
        """Return the ``(len(bb_dets), len(bb_trks))`` pairwise IoU matrix.

        Both inputs are sequences of ``[x1, y1, x2, y2]`` boxes.
        """
        if len(bb_dets) == 0 or len(bb_trks) == 0:
            return np.zeros((len(bb_dets), len(bb_trks)))

        # Broadcast detections along rows and tracks along columns.
        dets = np.asarray(bb_dets, dtype=float)[:, None, :4]
        trks = np.asarray(bb_trks, dtype=float)[None, :, :4]

        x1 = np.maximum(dets[..., 0], trks[..., 0])
        y1 = np.maximum(dets[..., 1], trks[..., 1])
        x2 = np.minimum(dets[..., 2], trks[..., 2])
        y2 = np.minimum(dets[..., 3], trks[..., 3])
        inter = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)

        area_d = (dets[..., 2] - dets[..., 0]) * (dets[..., 3] - dets[..., 1])
        area_t = (trks[..., 2] - trks[..., 0]) * (trks[..., 3] - trks[..., 1])
        # The epsilon keeps the division defined for zero-area boxes.
        return inter / (area_d + area_t - inter + 1e-6)

3. DeepSORT

3.1 改进点

DeepSORT 在 SORT 基础上加入外观特征：

- 代价矩阵：代价矩阵 = α × IoU 代价 + (1-α) × 外观代价
- 外观特征：CNN 提取 128 维特征向量
- 特征库：每个轨迹维护一个特征队列（最近 100 帧）
- 匹配：使用余弦距离度量外观相似度

3.2 外观特征提取

import torch
import torch.nn as nn
from torchvision.models import resnet50


class FeatureExtractor(nn.Module):
    """Appearance feature extractor: ResNet-50 backbone + linear projection."""

    def __init__(self, feature_dim=128):
        """Build the network.

        Args:
            feature_dim: size of the output embedding.
        """
        super().__init__()
        backbone = self._load_backbone()
        # Drop the final classification layer, keep everything up to pooling.
        self.features = nn.Sequential(*list(backbone.children())[:-1])
        self.fc = nn.Linear(2048, feature_dim)

    @staticmethod
    def _load_backbone():
        """Return an ImageNet-pretrained ResNet-50.

        The ``pretrained=True`` flag is deprecated since torchvision 0.13 in
        favour of ``weights=``; ``IMAGENET1K_V1`` is the weight set the old
        flag selected. Older torchvision releases without the weights enum
        fall back to the legacy flag.
        """
        try:
            from torchvision.models import ResNet50_Weights
        except ImportError:
            return resnet50(pretrained=True)
        return resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)

    def forward(self, images):
        """Embed a batch of cropped target images.

        Args:
            images: ``(B, 3, 128, 64)`` tensor of cropped targets.

        Returns:
            ``(B, feature_dim)`` tensor, L2-normalised along dim 1.
        """
        feat = self.features(images).flatten(1)
        feat = self.fc(feat)
        feat = nn.functional.normalize(feat, dim=1)
        return feat

3.3 级联匹配

class DeepSORT:
    """SORT extended with appearance features and cascade matching.

    Args:
        max_age: a track is dropped once ``time_since_update`` exceeds this.
        nn_budget: stored on the instance; not read by the methods shown here.
    """

    def __init__(self, max_age=70, nn_budget=100):
        self.max_age = max_age
        self.nn_budget = nn_budget
        self.tracks = []
        self.feature_extractor = FeatureExtractor()

    def update(self, detections, features):
        """Process one frame.

        Args:
            detections: ``(N, 5)`` array of ``[x1, y1, x2, y2, score]``.
            features: ``(N, 128)`` appearance features, one row per detection.

        Returns:
            Whatever ``self._get_results()`` returns.
        """
        # 1. Predict every track forward one frame.
        for track in self.tracks:
            track.predict()

        # 2. Appearance-based cascade matching.
        matched, unmatched_dets, unmatched_trks = self._cascade_match(
            detections, features
        )

        # 3. IoU matching for whatever is left over.
        if len(unmatched_dets) > 0 and len(unmatched_trks) > 0:
            iou_matched, local_unmatched, unmatched_trks = self._iou_match(
                detections[unmatched_dets], unmatched_trks
            )
            # _iou_match only sees the sliced detections, so the detection
            # indices it returns are positions in that slice; map them back to
            # rows of the full `detections` array before using them.
            # NOTE(review): assumes the returned track indices are the global
            # ones that were passed in - confirm against _iou_match.
            matched.extend((unmatched_dets[d], t) for d, t in iou_matched)
            unmatched_dets = [unmatched_dets[d] for d in local_unmatched]

        # 4. Update matched tracks, create new ones, delete stale ones.
        for d, t in matched:
            self.tracks[t].update(detections[d], features[d])
        for d in unmatched_dets:
            self.tracks.append(Track(detections[d], features[d]))
        self.tracks = [
            t for t in self.tracks if t.time_since_update <= self.max_age
        ]
        return self._get_results()

    def _cascade_match(self, detections, features):
        """Match detections to tracks by appearance, one age level at a time.

        Levels are visited in increasing ``time_since_update``, so tracks that
        were updated more recently get first pick of the detections.

        Returns:
            ``(matched, unmatched_dets, unmatched_trks)`` where ``matched`` is
            a list of ``(detection_index, track_index)`` pairs and the other
            two are lists of indices into ``detections`` / ``self.tracks``.
        """
        matched = []
        unmatched_dets = list(range(len(detections)))

        for age in range(self.max_age + 1):
            tracks_of_age = [
                i for i, t in enumerate(self.tracks)
                if t.time_since_update == age
            ]
            if not tracks_of_age or not unmatched_dets:
                continue

            # Rows follow `unmatched_dets`, columns follow `tracks_of_age`.
            cost_matrix = self._cosine_distance(
                features[unmatched_dets],
                [self.tracks[t].features for t in tracks_of_age],
            )
            row_idx, col_idx = linear_sum_assignment(cost_matrix)

            # Record claimed detections first and filter afterwards: removing
            # from `unmatched_dets` inside this loop would shift the positions
            # that the remaining row indices refer to.
            claimed = set()
            for r, c in zip(row_idx, col_idx):
                if cost_matrix[r, c] < 0.7:  # cosine-distance gate
                    matched.append((unmatched_dets[r], tracks_of_age[c]))
                    claimed.add(unmatched_dets[r])
            unmatched_dets = [d for d in unmatched_dets if d not in claimed]

        matched_trks = {t for _, t in matched}
        unmatched_trks = [
            i for i, trk in enumerate(self.tracks)
            if trk.time_since_update > 0 and i not in matched_trks
        ]
        return matched, unmatched_dets, unmatched_trks

4. ByteTrack

4.1 核心创新

ByteTrack 的关键洞察：低分检测框也有用！

传统方法：

- 高分检测 (>0.6) → 匹配跟踪
- 低分检测 (<0.6) → 直接丢弃

ByteTrack：

- 第一轮：高分检测 ↔ 已有轨迹 匹配
- 第二轮：低分检测 ↔ 剩余轨迹 匹配
- 第三轮：未匹配高分检测 → 创建新轨迹

4.2 实现

class ByteTrack:
    """Two-stage IoU tracker that also associates low-score detections.

    Args:
        high_thresh: detections scoring at least this are "high-score".
        low_thresh: detections scoring in ``[low_thresh, high_thresh)`` are
            "low-score"; anything below is ignored.
        max_age: a track is dropped once ``time_since_update`` exceeds this.
    """

    def __init__(self, high_thresh=0.6, low_thresh=0.1, max_age=30):
        self.high_thresh = high_thresh
        self.low_thresh = low_thresh
        self.max_age = max_age
        self.tracks = []
        self.track_id = 0

    def update(self, detections):
        """Process one frame.

        Args:
            detections: ``(N, 6)`` array-like of
                ``[x1, y1, x2, y2, score, class]``. An empty input of any
                shape is accepted.

        Returns:
            Whatever ``self._get_results()`` returns.
        """
        detections = np.asarray(detections)
        if detections.size == 0:
            # Normalise "no detections" (e.g. np.array([]) or []) to 2-D so
            # the score-column indexing below is always valid.
            detections = np.empty((0, 6))

        # Split into high-score and low-score detections.
        scores = detections[:, 4]
        high_dets = detections[scores >= self.high_thresh]
        low_dets = detections[
            (scores >= self.low_thresh) & (scores < self.high_thresh)
        ]

        # Predict every track forward one frame.
        for track in self.tracks:
            track.predict()

        # Round 1: high-score detections against all tracks.
        matched1, unmatched_tracks, unmatched_high = self._match(
            self.tracks, high_dets, thresh=0.3
        )

        # Round 2: low-score detections against the tracks left over.
        remaining_tracks = [self.tracks[i] for i in unmatched_tracks]
        matched2, _, _ = self._match(remaining_tracks, low_dets, thresh=0.5)

        # Update matched tracks. Round-2 indices refer to `remaining_tracks`.
        for t_idx, d_idx in matched1:
            self.tracks[t_idx].update(high_dets[d_idx])
        for t_idx, d_idx in matched2:
            remaining_tracks[t_idx].update(low_dets[d_idx])

        # New tracks are started from unmatched high-score detections only.
        for d_idx in unmatched_high:
            self.tracks.append(Track(high_dets[d_idx], self.track_id))
            self.track_id += 1

        # Drop tracks that have been unmatched for too long.
        self.tracks = [
            t for t in self.tracks if t.time_since_update <= self.max_age
        ]
        return self._get_results()

    def _match(self, tracks, detections, thresh):
        """Associate tracks with detections by IoU + Hungarian assignment.

        Args:
            tracks: list of track objects.
            detections: array of detections, one per row.
            thresh: minimum IoU for an assignment to count as a match.

        Returns:
            ``(matched, unmatched_tracks, unmatched_dets)``: ``matched`` is a
            list of ``(track_index, detection_index)`` pairs; the other two
            are index lists into ``tracks`` and ``detections``.
        """
        if not tracks or len(detections) == 0:
            return [], list(range(len(tracks))), list(range(len(detections)))

        # IoU matrix, indexed below as [track, detection].
        iou_matrix = self._compute_iou(tracks, detections)
        # Negate because the solver minimises cost and we maximise IoU.
        row_idx, col_idx = linear_sum_assignment(-iou_matrix)

        matched = []
        matched_tracks, matched_dets = set(), set()
        for r, c in zip(row_idx, col_idx):
            if iou_matrix[r, c] >= thresh:
                matched.append((r, c))
                matched_tracks.add(r)
                matched_dets.add(c)

        unmatched_tracks = [
            i for i in range(len(tracks)) if i not in matched_tracks
        ]
        unmatched_dets = [
            i for i in range(len(detections)) if i not in matched_dets
        ]
        return matched, unmatched_tracks, unmatched_dets