Configuration

This page covers all configuration options for vllm-client.

Client Configuration

Basic Setup

#![allow(unused)]
fn main() {
use vllm_client::VllmClient;

// Point the client at the server's OpenAI-compatible endpoint —
// note the trailing `/v1` path segment.
let client = VllmClient::new("http://localhost:8000/v1");
}

Using the Builder Pattern

For more complex configurations, use the builder pattern:

#![allow(unused)]
fn main() {
use vllm_client::VllmClient;

// The builder lets you set several options (URL, auth, timeout)
// before constructing the client in one step.
let client = VllmClient::builder()
    .base_url("http://localhost:8000/v1")
    .api_key("your-api-key")
    .timeout_secs(120) // seconds; raise this for long generations
    .build();
}

Configuration Options

Base URL

The base URL of your vLLM server. This should include the /v1 path for OpenAI compatibility.

#![allow(unused)]
fn main() {
// Local development
let client = VllmClient::new("http://localhost:8000/v1");

// Remote server (use HTTPS for anything non-local)
let client = VllmClient::new("https://api.example.com/v1");

// With trailing slash (automatically normalized)
let client = VllmClient::new("http://localhost:8000/v1/");
// Equivalent to: "http://localhost:8000/v1"
}

API Key

If your vLLM server requires authentication, configure the API key:

#![allow(unused)]
fn main() {
// Using method chain — attach a key to an already-created client
let client = VllmClient::new("http://localhost:8000/v1")
    .with_api_key("sk-your-api-key");

// Using builder — set the key at construction time
let client = VllmClient::builder()
    .base_url("http://localhost:8000/v1")
    .api_key("sk-your-api-key")
    .build();
}

The API key is sent as a Bearer token in the Authorization header.

Timeout

Configure the request timeout for long-running operations:

#![allow(unused)]
fn main() {
// Using method chain — adjust the timeout on an existing client
let client = VllmClient::new("http://localhost:8000/v1")
    .timeout_secs(300); // 5 minutes

// Using builder — configure the timeout up front
let client = VllmClient::builder()
    .base_url("http://localhost:8000/v1")
    .timeout_secs(300)
    .build();
}

If no timeout is configured, the underlying HTTP client's default applies (usually 30 seconds).

Request Configuration

When making requests, you can configure various parameters:

Model Selection

#![allow(unused)]
fn main() {
use vllm_client::{VllmClient, json};

let response = client.chat.completions().create()
    .model("Qwen/Qwen2.5-72B-Instruct")
    .messages(json!([{"role": "user", "content": "Hello!"}]))
    .send()
    .await?;
}

Sampling Parameters

#![allow(unused)]
fn main() {
// Sampling knobs tune the randomness and length of the output;
// see the parameter table below for types and valid ranges.
let response = client.chat.completions().create()
    .model("Qwen/Qwen2.5-72B-Instruct")
    .messages(json!([{"role": "user", "content": "Hello!"}]))
    .temperature(0.7)      // 0.0 - 2.0
    .top_p(0.9)            // 0.0 - 1.0
    .top_k(50)             // vLLM extension
    .max_tokens(1024)      // Max output tokens
    .send()
    .await?;
}
| Parameter | Type | Range | Description |
|---|---|---|---|
| `temperature` | `f32` | 0.0 - 2.0 | Controls randomness. Higher = more random |
| `top_p` | `f32` | 0.0 - 1.0 | Nucleus sampling threshold |
| `top_k` | `i32` | 1+ | Top-K sampling (vLLM extension) |
| `max_tokens` | `u32` | 1+ | Maximum tokens to generate |

Stop Sequences

#![allow(unused)]
fn main() {
use serde_json::json;

// Multiple stop sequences — generation halts at the first one produced
let response = client.chat.completions().create()
    .model("Qwen/Qwen2.5-72B-Instruct")
    .messages(json!([{"role": "user", "content": "Hello!"}]))
    .stop(json!(["END", "STOP", "\n\n"]))
    .send()
    .await?;

// Single stop sequence — a bare JSON string is also accepted
let response = client.chat.completions().create()
    .model("Qwen/Qwen2.5-72B-Instruct")
    .messages(json!([{"role": "user", "content": "Hello!"}]))
    .stop(json!("END"))
    .send()
    .await?;
}

Extra Parameters

vLLM supports additional parameters via the extra() method:

#![allow(unused)]
fn main() {
// `extra()` forwards arbitrary JSON fields to the server, which lets
// you use vLLM-specific options the typed builder does not cover.
let response = client.chat.completions().create()
    .model("Qwen/Qwen2.5-72B-Instruct")
    .messages(json!([{"role": "user", "content": "Think about this"}]))
    .extra(json!({
        "chat_template_kwargs": {
            "think_mode": true
        },
        "reasoning_effort": "high"
    }))
    .send()
    .await?;
}

Environment Variables

You can use environment variables to configure the client:

#![allow(unused)]
fn main() {
use std::env;
use vllm_client::VllmClient;

// Server URL, falling back to a local default when unset.
let base_url = env::var("VLLM_BASE_URL")
    .unwrap_or_else(|_| "http://localhost:8000/v1".to_string());

// Optional API key — only configured when the variable is set.
let api_key = env::var("VLLM_API_KEY").ok();

// Optional timeout in seconds; non-numeric values are ignored.
let timeout_secs = env::var("VLLM_TIMEOUT")
    .ok()
    .and_then(|s| s.parse::<u64>().ok());

let mut client_builder = VllmClient::builder()
    .base_url(&base_url);

if let Some(key) = api_key {
    client_builder = client_builder.api_key(&key);
}

if let Some(secs) = timeout_secs {
    client_builder = client_builder.timeout_secs(secs);
}

let client = client_builder.build();
}
| Variable | Description | Example |
|---|---|---|
| `VLLM_BASE_URL` | vLLM server URL | `http://localhost:8000/v1` |
| `VLLM_API_KEY` | API key (optional) | `sk-xxx` |
| `VLLM_TIMEOUT` | Timeout in seconds | `300` |

Best Practices

Reusing the Client

Create the client once and reuse it for multiple requests:

#![allow(unused)]
fn main() {
// Good: Reuse client
// NOTE(review): a fresh client presumably re-creates the underlying
// HTTP connection pool — confirm pooling behavior in the crate docs.
let client = VllmClient::new("http://localhost:8000/v1");

for prompt in prompts {
    let response = client.chat.completions().create()
        .model("Qwen/Qwen2.5-72B-Instruct")
        .messages(json!([{"role": "user", "content": prompt}]))
        .send()
        .await?;
}

// Avoid: Creating client for each request
for prompt in prompts {
    let client = VllmClient::new("http://localhost:8000/v1"); // Inefficient!
    // ...
}
}

Timeout Selection

Choose appropriate timeouts based on your use case:

| Use Case | Recommended Timeout |
|---|---|
| Simple queries | 30 seconds |
| Complex reasoning | 2-5 minutes |
| Long document generation | 10+ minutes |

Error Handling

Always handle errors appropriately:

#![allow(unused)]
fn main() {
use vllm_client::{VllmClient, VllmError};

// Match on the error enum to handle each failure mode distinctly.
match client.chat.completions().create()
    .model("Qwen/Qwen2.5-72B-Instruct")
    .messages(json!([{"role": "user", "content": "Hello!"}]))
    .send()
    .await
{
    // Success: print the assistant's reply
    Ok(response) => println!("{}", response.content.unwrap()),
    // The request exceeded the configured timeout
    Err(VllmError::Timeout) => eprintln!("Request timed out"),
    // The server answered with a non-success HTTP status
    Err(VllmError::ApiError { status_code, message, .. }) => {
        eprintln!("API error ({}): {}", status_code, message);
    }
    // Anything else (connection failures, serialization, ...)
    Err(e) => eprintln!("Error: {}", e),
}
}

Next Steps