API错误处理与异常排查从入门到精通

2026-04-21 10:37:13 | 作者：曹令琨Iris

API错误处理是保障服务稳定性和用户体验的关键环节。本文将系统讲解如何诊断API错误根源、构建弹性调用机制以及实施预防策略，帮助开发者建立完整的错误处理体系，提升系统可靠性。

诊断错误根源

API错误诊断是解决问题的第一步，需要建立系统化的分析流程。错误码体系是诊断的基础，KIMI API采用分层错误码设计，通过错误码即可初步定位问题类型。

错误码体系解析

KIMI API错误码分为系统级和API级两大类，涵盖从请求验证到服务调用的全流程错误场景：

系统级错误（-1000至-1099）

-1000: 系统异常 - 服务器内部处理错误，通常由服务端逻辑缺陷或资源耗尽导致
-1001: 请求参数校验错误 - 请求格式不符合API规范，如字段类型错误或缺失
-1002: 无匹配的路由 - 请求路径不正确或API版本不匹配

API级错误（-2000至-2099）

-2000: 请求参数非法 - 业务逻辑校验失败，如必填参数缺失
-2001: 请求失败 - 后端服务调用超时或返回非预期结果
-2002: Token已失效 - 身份认证信息过期或无效
-2003: 远程文件URL非法 - 文件链接格式错误或协议不支持
-2004: 远程文件超出大小 - 文件体积超过系统限制
-2005: 已有对话流正在输出 - 同一会话ID存在并发请求冲突
-2006: 探索版使用量已达到上限 - 免费额度耗尽

错误诊断流程

错误诊断应遵循"由表及里"的原则，从现象到本质逐步深入：

1. 错误现象收集：记录完整的错误响应、请求参数和时间戳
2. 错误码定位：根据错误码初步判断错误类型和责任方
3. 日志分析：检查请求日志、服务日志和依赖服务日志
4. 环境验证：确认开发/生产环境差异、依赖服务状态和网络连通性
5. 代码调试：针对复杂问题进行本地复现和断点调试

典型错误案例分析

案例：Token失效错误（-2002）

问题现象：请求API时突然返回-2002错误，提示"Token已失效"，但此前一直正常运行。

排查步骤：

检查Token过期时间设置和实际存活时间
验证Token刷新机制是否正常工作
确认服务端Token黑名单是否误判
检查客户端时间是否与服务器同步

解决代码：

/**
 * Caches an access token and refreshes it shortly before it expires.
 *
 * Concurrent callers share a single in-flight refresh request, so a burst of
 * `getValidToken()` calls never sends more than one refresh at a time.
 *
 * NOTE(review): `getRefreshToken()` and `handleTokenRefreshFailure()` are
 * called below but are not defined in this snippet; they must be supplied by
 * the surrounding code.
 */
class TokenManager {
  private token: string | null = null;
  private expiresAt: number = 0;
  private refreshThreshold = 60; // refresh this many seconds before expiry
  // Refresh currently in progress, if any; shared by all concurrent callers.
  private pendingRefresh: Promise<void> | null = null;

  /**
   * Returns a usable access token, refreshing it first when it is missing or
   * within `refreshThreshold` seconds of expiring.
   *
   * @returns the current access token
   * @throws whatever the refresh request throws, or an Error when the refresh
   *         completes without yielding a token
   */
  async getValidToken(): Promise<string> {
    const needsRefresh =
      !this.token || Date.now() >= this.expiresAt - this.refreshThreshold * 1000;

    if (needsRefresh) {
      if (!this.pendingRefresh) {
        // Clear the shared promise once it settles so a later expiry (or a
        // failed attempt) triggers a new request instead of reusing the old one.
        this.pendingRefresh = this.refreshToken().finally(() => {
          this.pendingRefresh = null;
        });
      }
      await this.pendingRefresh;
    }

    if (!this.token) {
      throw new Error('Token刷新失败');
    }
    return this.token;
  }

  /**
   * Requests a new access token from `/api/token/refresh` and stores it with
   * its expiry time. On any failure the failure handler is invoked and the
   * error is rethrown.
   */
  private async refreshToken(): Promise<void> {
    try {
      const response = await fetch('/api/token/refresh', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ refreshToken: this.getRefreshToken() })
      });

      if (!response.ok) throw new Error('Token刷新失败');

      const data = await response.json();
      const accessToken: unknown = data?.accessToken;
      const expiresIn = Number(data?.expiresIn);

      // Reject malformed payloads instead of caching `undefined` / NaN, which
      // would otherwise be handed to callers as a "valid" token.
      if (typeof accessToken !== 'string' || accessToken === '' || !Number.isFinite(expiresIn)) {
        throw new Error('Token刷新响应无效');
      }

      this.token = accessToken;
      this.expiresAt = Date.now() + expiresIn * 1000;
    } catch (error) {
      // A failed refresh hands control to the failure handler (the original
      // comment describes this as triggering the re-login flow).
      this.handleTokenRefreshFailure();
      throw error;
    }
  }
}

构建弹性调用机制

弹性调用机制是提升系统容错能力的核心手段，通过合理的错误处理策略和设计模式，使系统在面对错误时能够优雅降级或自动恢复。

错误处理设计模式

熔断器模式

熔断器模式可防止故障级联传播，当错误率超过阈值时自动"熔断"，停止对故障服务的调用，避免资源耗尽。

/**
 * Minimal circuit breaker.
 *
 * States:
 * - `closed`:    calls pass through; consecutive failures are counted.
 * - `open`:      calls are rejected immediately until `resetTimeout` ms have
 *                passed since the last failure.
 * - `half-open`: trial calls pass through; `successThreshold` consecutive
 *                successes close the breaker, any failure reopens it.
 */
class CircuitBreaker {
  private state: 'closed' | 'open' | 'half-open' = 'closed';
  private failureCount = 0;
  private successCount = 0;
  private failureThreshold = 5;  // consecutive failures that open the breaker
  private successThreshold = 3;  // consecutive half-open successes that close it
  private resetTimeout = 10000;  // ms to stay open before allowing trial calls
  private lastFailureTime = 0;

  /**
   * @param options optional overrides for the default thresholds; omitted
   *                fields keep the defaults above
   */
  constructor(
    options: { failureThreshold?: number; successThreshold?: number; resetTimeout?: number } = {}
  ) {
    this.failureThreshold = options.failureThreshold ?? this.failureThreshold;
    this.successThreshold = options.successThreshold ?? this.successThreshold;
    this.resetTimeout = options.resetTimeout ?? this.resetTimeout;
  }

  /**
   * Runs `fn` through the breaker.
   *
   * @param fn the protected asynchronous operation
   * @returns the value `fn` resolves with
   * @throws an Error when the breaker is open and the reset timeout has not
   *         elapsed; otherwise rethrows whatever `fn` rejects with
   */
  async execute<T>(fn: () => Promise<T>): Promise<T> {
    if (this.state === 'open') {
      if (Date.now() - this.lastFailureTime > this.resetTimeout) {
        this.state = 'half-open';
        // Every probe round starts from zero so earlier successes cannot
        // count toward closing the breaker.
        this.successCount = 0;
      } else {
        throw new Error('服务暂时不可用，请稍后再试');
      }
    }

    try {
      const result = await fn();
      this.handleSuccess();
      return result;
    } catch (error) {
      this.handleFailure();
      throw error;
    }
  }

  /** Records a successful call and closes the breaker after enough half-open successes. */
  private handleSuccess(): void {
    if (this.state === 'half-open') {
      this.successCount++;
      if (this.successCount >= this.successThreshold) {
        this.reset();
      }
    } else {
      // In the closed state a success ends the current failure streak.
      this.failureCount = 0;
    }
  }

  /** Records a failed call and opens the breaker when appropriate. */
  private handleFailure(): void {
    this.lastFailureTime = Date.now();
    this.failureCount++;

    if (this.state === 'half-open') {
      // A failed probe reopens the breaker and discards partial progress.
      this.state = 'open';
      this.successCount = 0;
    } else if (this.failureCount >= this.failureThreshold) {
      this.state = 'open';
    }
  }

  /** Returns the breaker to the closed state with all counters cleared. */
  private reset(): void {
    this.state = 'closed';
    this.failureCount = 0;
    this.successCount = 0;
  }
}

// Usage example: route an API call through a shared circuit breaker instance.
const breaker = new CircuitBreaker();
try {
  const result = await breaker.execute(() => callKimiAPI());
} catch (error) {
  // Handle the error here (covers both a rejected call and an open breaker).
}

退避策略

退避策略用于处理暂时性错误，通过指数退避算法控制重试间隔，避免加剧服务负载。

/**
 * Runs `fn`, retrying with exponential backoff and random jitter while it
 * keeps failing with a retryable error.
 *
 * `fn` is always attempted at least once, even when `maxRetries` is negative
 * or NaN (both are treated as "no retries").
 *
 * @param fn           the asynchronous operation to run
 * @param maxRetries   maximum number of retries after the first attempt
 * @param initialDelay base delay in ms; attempt `i` waits
 *                     `initialDelay * 2^i` scaled by a random factor in [0.5, 1)
 * @returns the value `fn` resolves with
 * @throws the last error thrown by `fn`, unchanged, once the error is not
 *         retryable or the retry budget is exhausted
 */
async function withRetry<T>(
  fn: () => Promise<T>,
  maxRetries = 3,
  initialDelay = 1000
): Promise<T> {
  const retryBudget = Number.isNaN(maxRetries) ? 0 : Math.max(0, maxRetries);

  for (let attempt = 0; ; attempt++) {
    try {
      return await fn();
    } catch (error) {
      // Give up immediately on non-retryable errors or when out of retries;
      // rethrow the original value so callers see exactly what `fn` threw.
      if (!isRetryableError(error) || attempt >= retryBudget) {
        throw error;
      }

      // Exponential backoff with jitter, so that simultaneous clients do not
      // all retry at the same moment.
      const delay = initialDelay * Math.pow(2, attempt) * (0.5 + Math.random() * 0.5);
      await new Promise<void>(resolve => setTimeout(resolve, delay));
    }
  }
}

/**
 * Tells whether an error is worth retrying.
 *
 * Only errors carrying a numeric `errcode` of -1000 (system error) or -2001
 * (request failed) are retryable; anything else, including `null`,
 * `undefined` and errors without an `errcode`, is not.
 *
 * @param error the caught value
 * @returns `true` when the error is retryable, otherwise `false` (always a
 *          real boolean)
 */
function isRetryableError(error: unknown): boolean {
  const retryableCodes = [-1000, -2001];
  const code = (error as { errcode?: unknown } | null | undefined)?.errcode;
  return typeof code === 'number' && retryableCodes.includes(code);
}

错误模拟测试

错误模拟测试是验证错误处理逻辑有效性的关键手段，通过主动注入错误来测试系统的容错能力。

API错误模拟工具

/**
 * Middleware that simulates API errors and latency, driven by query
 * parameters:
 *
 * - `simulateError=true` together with an integer `errorCode` answers the
 *   request with HTTP 400 and a KIMI-style error body instead of calling the
 *   next handler.
 * - `delay=<ms>` postpones the outcome (simulated error or pass-through) by
 *   that many milliseconds.
 *
 * Invalid values are ignored: a non-numeric `errorCode` passes the request
 * through, and a non-numeric or non-positive `delay` means no delay.
 *
 * NOTE(review): any client can trigger errors and delays through the query
 * string, so this looks intended for test environments only; confirm it is
 * not mounted in production.
 */
function errorSimulationMiddleware(req: Request, res: Response, next: NextFunction) {
  const { simulateError, errorCode, delay } = req.query;

  const errorMessages: Record<number, string> = {
    -1000: '系统异常',
    -1001: '请求参数校验错误',
    -2000: '请求参数非法',
    -2002: 'Token已失效',
    -2005: '已有对话流正在输出'
  };

  // Either sends the simulated error or passes the request on.
  const finish = (): void => {
    const code =
      simulateError === 'true' && errorCode
        ? Number.parseInt(errorCode as string, 10)
        : Number.NaN;

    // Never emit `errcode: NaN` (serialised as null): without a valid code
    // the request goes to the real handler.
    if (Number.isNaN(code)) {
      next();
      return;
    }

    res.status(400).json({
      errcode: code,
      errmsg: errorMessages[code] || '模拟错误',
      data: null
    });
  };

  const delayMs = delay ? Number.parseInt(delay as string, 10) : Number.NaN;
  if (Number.isFinite(delayMs) && delayMs > 0) {
    setTimeout(finish, delayMs);
    return;
  }

  finish();
}

// Usage: register the middleware on the API route, ahead of the real controller.
router.post('/chat/completions', errorSimulationMiddleware, chatController);

测试用例设计

// Test cases for API error handling (jest + nock)
describe('API错误处理测试', () => {
  // Drop leftover interceptors so one test's unused mocks cannot answer
  // requests made by the next test.
  afterEach(() => {
    nock.cleanAll();
  });

  test('当Token失效时应自动刷新并重试', async () => {
    // First call is rejected with "token expired"; the retry after the
    // refresh succeeds.
    nock('https://api.kimi.moonshot.cn')
      .post('/v1/chat/completions')
      .reply(400, { errcode: -2002, errmsg: 'Token已失效' })
      .post('/v1/chat/completions')
      .reply(200, { id: 'test', object: 'chat.completion', choices: [] });

    // Token refresh endpoint
    nock('https://api.kimi.moonshot.cn')
      .post('/v1/token/refresh')
      .reply(200, { accessToken: 'new-token', expiresIn: 3600 });

    const result = await chatService.sendMessage('测试消息');
    expect(result).toHaveProperty('id');
  });

  test('并发请求冲突时应实现请求队列', async () => {
    // Both concurrent requests first hit the "stream already in progress" conflict.
    nock('https://api.kimi.moonshot.cn')
      .post('/v1/chat/completions')
      .times(2)
      .reply(400, { errcode: -2005, errmsg: '已有对话流正在输出' });

    // Each of the two requests needs its own successful retry; a single
    // success reply could satisfy only one of the assertions below.
    nock('https://api.kimi.moonshot.cn')
      .post('/v1/chat/completions')
      .times(2)
      .reply(200, { id: 'test', object: 'chat.completion', choices: [] });

    // Fire both requests at the same time
    const [result1, result2] = await Promise.all([
      chatService.sendMessage('测试1'),
      chatService.sendMessage('测试2')
    ]);

    // Both requests must eventually succeed
    expect(result1).toHaveProperty('id');
    expect(result2).toHaveProperty('id');
  });
});

监控告警配置

建立完善的监控告警体系，可及时发现和响应API错误，防止问题扩大。

错误监控实现

/**
 * Counts API errors per error code and fires an alert callback when a code
 * occurs often enough within its configured time window.
 *
 * Codes with a configured threshold are tracked with a sliding window: only
 * occurrences from the last `timeWindow` seconds count, so one old error
 * cannot block later alerts. Codes without a threshold are only counted.
 */
class ApiErrorMonitor {
  private errorStats: Record<number, { count: number; firstOccurrence: number; lastOccurrence: number }> = {};
  private alertThresholds: Record<number, { count: number; timeWindow: number }> = {
    -1000: { count: 5, timeWindow: 60 }, // alert on 5 system errors within 60 seconds
    -2002: { count: 10, timeWindow: 300 }, // alert on 10 token-expired errors within 300 seconds
  };
  private alertCallback: (errorCode: number, stats: any) => void;
  // Timestamps (ms) of recent occurrences, kept only for codes that have a threshold.
  private recentOccurrences: Record<number, number[]> = {};
  private cleanupTimer: ReturnType<typeof setInterval> | null = null;

  /**
   * @param alertCallback invoked with the error code and a snapshot
   *                      `{ count, firstOccurrence, lastOccurrence }` of the
   *                      window that triggered the alert
   */
  constructor(alertCallback: (errorCode: number, stats: any) => void) {
    this.alertCallback = alertCallback;
  }

  /**
   * Records one occurrence of `errorCode` and alerts if its threshold is reached.
   * Values that are not real numbers (e.g. `undefined` from an error without
   * an `errcode`) are ignored.
   */
  recordError(errorCode: number): void {
    if (typeof errorCode !== 'number' || Number.isNaN(errorCode)) return;

    const now = Date.now();
    const threshold = this.alertThresholds[errorCode];

    if (threshold) {
      // Sliding window: drop occurrences older than the window, then add this one.
      const windowStart = now - threshold.timeWindow * 1000;
      const recent = (this.recentOccurrences[errorCode] ?? []).filter(t => t >= windowStart);
      recent.push(now);
      this.recentOccurrences[errorCode] = recent;
      this.errorStats[errorCode] = {
        count: recent.length,
        firstOccurrence: recent[0],
        lastOccurrence: now
      };
    } else if (!this.errorStats[errorCode]) {
      this.errorStats[errorCode] = { count: 1, firstOccurrence: now, lastOccurrence: now };
    } else {
      this.errorStats[errorCode].count++;
      this.errorStats[errorCode].lastOccurrence = now;
    }

    this.checkAlertThreshold(errorCode);
  }

  /** Fires the alert callback when the in-window count reaches the threshold. */
  private checkAlertThreshold(errorCode: number): void {
    const threshold = this.alertThresholds[errorCode];
    if (!threshold) return;

    const stats = this.errorStats[errorCode];
    // The stats only cover occurrences inside the window (see recordError),
    // so comparing the count is sufficient.
    if (stats.count < threshold.count) return;

    // Hand the callback its own copy, and reset first so that a throwing or
    // asynchronous callback neither sees zeroed stats nor causes duplicate alerts.
    const snapshot = { ...stats };
    this.recentOccurrences[errorCode] = [];
    stats.count = 0;
    stats.firstOccurrence = Date.now();

    this.alertCallback(errorCode, snapshot);
  }

  /**
   * Starts a periodic job that removes statistics for codes whose last
   * occurrence is older than `interval` ms. Calling it again replaces the
   * previous job instead of stacking another timer.
   *
   * @param interval both the job period and the maximum idle age, in ms
   */
  startCleanupJob(interval = 3600000): void {
    this.stopCleanupJob();
    this.cleanupTimer = setInterval(() => {
      const now = Date.now();
      for (const key of Object.keys(this.errorStats)) {
        const code = Number(key);
        if (now - this.errorStats[code].lastOccurrence > interval) {
          delete this.errorStats[code];
          delete this.recentOccurrences[code];
        }
      }
    }, interval);
  }

  /** Stops the cleanup job started by `startCleanupJob`, if one is running. */
  stopCleanupJob(): void {
    if (this.cleanupTimer !== null) {
      clearInterval(this.cleanupTimer);
      this.cleanupTimer = null;
    }
  }
}

// Usage example: create a monitor whose callback forwards alerts.
const monitor = new ApiErrorMonitor((code, stats) => {
  // Send the alert notification
  sendAlert(`API错误告警: 错误码 ${code}, 短时间内出现 ${stats.count} 次`);
});

// Record errors wherever API errors are handled, then rethrow for the caller.
try {
  // API call goes here
} catch (error) {
  monitor.recordError(error.errcode);
  throw error;
}

实施预防策略

预防策略是降低错误发生率的根本措施，通过规范化的开发流程和系统性的设计原则，从源头减少错误产生。

参数验证与请求规范

请求参数预验证

在发送API请求前进行本地参数验证，可有效减少无效请求和参数错误。

// 请求参数验证工具
import { z } from 'zod';

// Builds a "required, non-empty string" field with the given validation message.
const nonEmptyString = (message: string) => z.string().min(1, message);

// A single chat message: role, non-empty text content and optional image URLs.
const ChatMessageSchema = z.object({
  role: z.enum(['user', 'assistant', 'system']),
  content: nonEmptyString('消息内容不能为空'),
  images: z.array(z.string().url()).optional()
});

// Request schema for the KIMI chat-completion API.
const ChatCompletionSchema = z.object({
  model: nonEmptyString('模型名称不能为空'),
  messages: z.array(ChatMessageSchema).min(1, '至少需要一条消息'),
  stream: z.boolean().optional().default(false),
  temperature: z.number().min(0).max(2).optional().default(0.7),
  max_tokens: z.number().int().positive().optional()
});

/**
 * Validates a chat-completion request against `ChatCompletionSchema`.
 *
 * @param data the candidate request payload
 * @returns `{ valid: true }` when the payload matches the schema; otherwise
 *          `{ valid: false, error }` where `error` lists every issue as
 *          `path: message`, joined with `; `
 */
function validateChatRequest(data: unknown): { valid: boolean; error?: string } {
  try {
    // safeParse reports validation failures as a result object instead of
    // throwing, so no exception handling is needed for the normal case.
    const result = ChatCompletionSchema.safeParse(data);
    if (result.success) {
      return { valid: true };
    }

    // `issues` is the canonical list of validation problems on a ZodError.
    return {
      valid: false,
      error: result.error.issues.map(issue => `${issue.path.join('.')}: ${issue.message}`).join('; ')
    };
  } catch {
    // Anything unexpected still yields a generic failure rather than throwing.
    return { valid: false, error: '参数验证失败' };
  }
}

// Usage example: validate a request locally before sending it.
const requestData = {
  model: 'moonshot-v1-8k',
  messages: [{ role: 'user', content: '' }] // empty message content, which the schema rejects
};

const { valid, error } = validateChatRequest(requestData);
if (!valid) {
  console.error('请求参数错误:', error);
  // Handle the parameter error here
}

API请求封装

封装API请求客户端，统一处理认证、参数验证和错误处理。

/**
 * Thin client for the KIMI API that centralises authentication, local
 * parameter validation, timeouts and error normalisation.
 *
 * Every failure is surfaced as an `APIException`.
 */
class KimiApiClient {
  private apiKey: string;
  private baseUrl: string;
  private timeout: number;

  /**
   * @param config.apiKey  bearer token sent in the `Authorization` header
   * @param config.baseUrl API root; defaults to `https://api.kimi.moonshot.cn/v1`
   * @param config.timeout request timeout in ms; defaults to 30000
   */
  constructor(config: { apiKey: string; baseUrl?: string; timeout?: number }) {
    this.apiKey = config.apiKey;
    // `||` is deliberate: an empty base URL or a zero timeout falls back to the default.
    this.baseUrl = config.baseUrl || 'https://api.kimi.moonshot.cn/v1';
    this.timeout = config.timeout || 30000;
  }

  /**
   * Sends a chat-completion request.
   *
   * @param params request payload; validated locally before anything is sent
   * @returns the parsed JSON response body
   * @throws APIException with code -1001 when local validation fails, with
   *         the API's `errcode` (or -1000) and the HTTP status when the API
   *         reports an error, and with code -1000 on timeout, network failure
   *         or an unparseable response body
   */
  async chatCompletion(params: any): Promise<any> {
    const { valid, error } = validateChatRequest(params);
    if (!valid) {
      throw new APIException(-1001, `请求参数错误: ${error}`);
    }

    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), this.timeout);

    try {
      const response = await fetch(`${this.baseUrl}/chat/completions`, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${this.apiKey}`
        },
        body: JSON.stringify(params),
        signal: controller.signal
      });

      let data: any;
      try {
        data = await response.json();
      } catch (parseError) {
        // An abort while reading the body is a timeout, not a parse problem.
        if ((parseError as { name?: unknown } | null)?.name === 'AbortError') {
          throw parseError;
        }
        // A non-JSON body (for example an HTML gateway error page) keeps the
        // HTTP status instead of being reported as a network failure.
        const reason = parseError instanceof Error ? parseError.message : String(parseError);
        throw new APIException(-1000, `API响应解析失败: ${reason}`, response.status);
      }

      if (!response.ok || data?.errcode) {
        throw new APIException(
          data?.errcode || -1000,
          data?.errmsg || 'API请求失败',
          response.status
        );
      }

      return data;
    } catch (caught: unknown) {
      if (caught instanceof APIException) {
        throw caught;
      }
      if ((caught as { name?: unknown } | null)?.name === 'AbortError') {
        throw new APIException(-1000, `请求超时(${this.timeout}ms)`);
      }
      const reason = caught instanceof Error ? caught.message : String(caught);
      throw new APIException(-1000, `网络请求失败: ${reason}`);
    } finally {
      // Always release the timer, including when fetch rejects; it also stays
      // armed while the body is being read, so slow bodies time out too.
      clearTimeout(timeoutId);
    }
  }
}

文档与测试体系

错误码文档自动生成

维护错误码文档并确保与代码同步更新，便于开发者查阅和使用。

// Error code definitions; this table is the source the documentation generator reads.
// Each entry carries the numeric code, a short message, a longer description
// and a suggested solution.
const ERROR_CODES = {
  SYSTEM_ERROR: {
    code: -1000,
    message: '系统异常',
    description: '服务器内部处理错误，通常由服务端逻辑缺陷或资源耗尽导致',
    solution: '请稍后重试，如持续出现请联系技术支持'
  },
  PARAM_VALIDATION_ERROR: {
    code: -1001,
    message: '请求参数校验错误',
    description: '请求格式不符合API规范，如字段类型错误或缺失',
    solution: '检查请求参数是否符合API文档要求，确保所有必填字段存在且格式正确'
  },
  // Remaining error codes are omitted here...
} as const;

/**
 * Renders the error code reference as markdown.
 *
 * Currently emits the system-level section (codes -1000 to -1099) as a table
 * ordered -1000, -1001, ... (closest to zero first). Cell text is escaped so
 * that `|` characters or line breaks inside a message cannot break the table.
 *
 * @returns the markdown document as a string
 */
function generateErrorCodesDoc(): string {
  // Keeps arbitrary text inside a single markdown table cell.
  const escapeCell = (value: unknown): string =>
    String(value).replace(/\|/g, '\\|').replace(/\r?\n/g, ' ');

  const header =
    '# KIMI API错误码参考\n\n' +
    '## 系统级错误（-1000至-1099）\n\n' +
    '| 错误码 | 错误信息 | 描述 | 解决方案 |\n' +
    '|--------|----------|------|----------|\n';

  // Codes are negative, so sorting in descending numeric order lists
  // -1000 before -1001, matching how the codes are normally presented.
  const systemErrors = Object.values(ERROR_CODES)
    .filter(e => e.code >= -1099 && e.code <= -1000)
    .sort((a, b) => b.code - a.code);

  const rows = systemErrors.map(
    e =>
      `| ${e.code} | ${escapeCell(e.message)} | ${escapeCell(e.description)} | ${escapeCell(e.solution)} |\n`
  );

  // The API-level error code section is not generated yet (left out in the original as well).

  return header + rows.join('');
}

// Write the generated error code reference to docs/error-codes.md.
fs.writeFileSync('docs/error-codes.md', generateErrorCodesDoc());

自动化测试覆盖

建立全面的测试覆盖，确保错误处理逻辑的正确性和稳定性。

// Full test suite for KIMI API error handling (jest + nock)
describe('KIMI API错误处理', () => {
  let apiClient: KimiApiClient;

  beforeEach(() => {
    apiClient = new KimiApiClient({
      apiKey: 'test-api-key',
      baseUrl: 'https://test-api.kimi.moonshot.cn/v1'
    });
  });

  // Remove interceptors a test did not consume so they cannot leak into the next one.
  afterEach(() => {
    nock.cleanAll();
  });

  describe('参数验证', () => {
    test('缺失必填参数应返回参数校验错误', async () => {
      await expect(apiClient.chatCompletion({
        // `model` is intentionally missing
        messages: [{ role: 'user', content: '测试' }]
      })).rejects.toThrow(/请求参数错误/);
    });

    test('参数格式错误应返回参数校验错误', async () => {
      await expect(apiClient.chatCompletion({
        model: 'moonshot-v1-8k',
        messages: [{ role: 'invalid', content: '测试' }] // invalid role value
      })).rejects.toThrow(/请求参数错误/);
    });
  });

  describe('API错误处理', () => {
    test('Token失效错误应正确捕获', async () => {
      nock('https://test-api.kimi.moonshot.cn')
        .post('/v1/chat/completions')
        .reply(400, { errcode: -2002, errmsg: 'Token已失效' });

      await expect(apiClient.chatCompletion({
        model: 'moonshot-v1-8k',
        messages: [{ role: 'user', content: '测试' }]
      })).rejects.toThrow(/Token已失效/);
    });

    test('并发请求冲突错误应正确捕获', async () => {
      nock('https://test-api.kimi.moonshot.cn')
        .post('/v1/chat/completions')
        .reply(400, { errcode: -2005, errmsg: '已有对话流正在输出' });

      await expect(apiClient.chatCompletion({
        model: 'moonshot-v1-8k',
        messages: [{ role: 'user', content: '测试' }]
      })).rejects.toThrow(/已有对话流正在输出/);
    });
  });

  describe('网络错误处理', () => {
    test('请求超时期望抛出超时错误', async () => {
      // Use a short client timeout so the test finishes well inside Jest's
      // default per-test timeout; waiting for the 30 s default would make
      // Jest abort the test before the client ever times out.
      const fastClient = new KimiApiClient({
        apiKey: 'test-api-key',
        baseUrl: 'https://test-api.kimi.moonshot.cn/v1',
        timeout: 100
      });

      // The mocked server answers later than the client timeout
      nock('https://test-api.kimi.moonshot.cn')
        .post('/v1/chat/completions')
        .delayConnection(500)
        .reply(200, {});

      await expect(fastClient.chatCompletion({
        model: 'moonshot-v1-8k',
        messages: [{ role: 'user', content: '测试' }]
      })).rejects.toThrow(/请求超时/);
    });
  });
});