
Can't process response from llamacpp server

gergap opened this issue 8 months ago • 2 comments

I have nvim/llm working with ollama, using llm-ls-x86_64-unknown-linux-gnu-0.5.3. I tried to switch the config to the OpenAI API backend to connect to a llamacpp server, because llamacpp supports my AMD GPU, which ollama does not.

I can see in Wireshark that the request is sent and the llamacpp server sends back a successful response. However, I don't get any completion in nvim, which is probably caused by llm-ls failing to process the response.

I get this error in nvim: [LLM] serde json error: data did not match any variant of untagged enum OpenAIAPIResponse

request:

{
  "model": "models/codellama-7b.Q4_K_M.gguf",
  "options": {
    "temperature": 0.2,
    "top_p": 0.95
  },
  "parameters": {
    "max_new_tokens": 60,
    "temperature": 0.2,
    "top_p": 0.95
  },
  "prompt": "<PRE> #include <stdio.h>\n\nfloat multiply(float a, float b)\n{\n     <SUF>\n}\n\nint main(int argc, char *argv[])\n{\n    return 0;\n}\n\n\n <MID>",
  "stream": false
}

response:

{
  "content": "return a * b; <EOT>",
  "id_slot": 0,
  "stop": true,
  "model": "models/codellama-7b.Q4_K_M.gguf",
  "tokens_predicted": 6,
  "tokens_evaluated": 54,
  "generation_settings": {
    "n_ctx": 512,
    "n_predict": -1,
    "model": "models/codellama-7b.Q4_K_M.gguf",
    "seed": 4294967295,
    "temperature": 0.800000011920929,
    "dynatemp_range": 0,
    "dynatemp_exponent": 1,
    "top_k": 40,
    "top_p": 0.949999988079071,
    "min_p": 0.05000000074505806,
    "tfs_z": 1,
    "typical_p": 1,
    "repeat_last_n": 64,
    "repeat_penalty": 1,
    "presence_penalty": 0,
    "frequency_penalty": 0,
    "penalty_prompt_tokens": [],
    "use_penalty_prompt_tokens": false,
    "mirostat": 0,
    "mirostat_tau": 5,
    "mirostat_eta": 0.10000000149011612,
    "penalize_nl": false,
    "stop": [],
    "n_keep": 0,
    "n_discard": 0,
    "ignore_eos": false,
    "stream": false,
    "logit_bias": [],
    "n_probs": 0,
    "min_keep": 0,
    "grammar": "",
    "samplers": [
      "top_k",
      "tfs_z",
      "typical_p",
      "top_p",
      "min_p",
      "temperature"
    ]
  },
  "prompt": "<PRE> #include <stdio.h>\n\nfloat multiply(float a, float b)\n{\n     <SUF>\n}\n\nint main(int argc, char *argv[])\n{\n    return 0;\n}\n\n\n <MID>",
  "truncated": false,
  "stopped_eos": true,
  "stopped_word": false,
  "stopped_limit": false,
  "stopping_word": "",
  "tokens_cached": 59,
  "timings": {
    "prompt_n": 54,
    "prompt_ms": 601.562,
    "prompt_per_token_ms": 11.140037037037038,
    "prompt_per_second": 89.76630837719138,
    "predicted_n": 6,
    "predicted_ms": 315.451,
    "predicted_per_token_ms": 52.57516666666667,
    "predicted_per_second": 19.020386684461293
  }
}
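
For what it's worth, the response above is llamacpp's native /completion format (the generated text sits in a top-level "content" field), not the OpenAI completions schema, which wraps the text in a "choices" array. I assume that is why the untagged enum rejects it. Here is a minimal sketch of how serde behaves in that case, using hypothetical types rather than llm-ls's actual definitions:

use serde::Deserialize;

// Hypothetical stand-ins for the OpenAI response shapes -- not llm-ls's real types.
#[derive(Debug, Deserialize)]
struct OpenAIChoice {
    text: String,
}

#[derive(Debug, Deserialize)]
struct OpenAICompletion {
    choices: Vec<OpenAIChoice>,
}

#[derive(Debug, Deserialize)]
struct OpenAIError {
    error: String,
}

// With #[serde(untagged)], serde tries each variant in order and reports
// "data did not match any variant of untagged enum ..." when all of them fail.
#[derive(Debug, Deserialize)]
#[serde(untagged)]
enum OpenAIAPIResponse {
    Completion(OpenAICompletion),
    Error(OpenAIError),
}

fn main() {
    // llamacpp's native reply has "content" at the top level, so neither
    // variant above matches and deserialization fails.
    let body = r#"{ "content": "return a * b; <EOT>", "stop": true }"#;
    let parsed: Result<OpenAIAPIResponse, serde_json::Error> = serde_json::from_str(body);
    println!("{:?}", parsed.err());
}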

I hope you can fix that, or tell me what I did wrong if it's my fault.

gergap · May 30 '24 18:05