From 3d94c257e6b97927c7ac3cb78f49d42c260668c5 Mon Sep 17 00:00:00 2001
From: Hugo SIMON <hugo.simon@dgfip.finances.gouv.fr>
Date: Wed, 3 Jul 2024 07:54:24 +0200
Subject: [PATCH] feat: Update makefile to the correct Ollama behaviour

---
 Makefile                |  2 +-
 api/app/ds/ai_models.py |  8 +++++---
 doc/Ollama.md           | 12 +++++++-----
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/Makefile b/Makefile
index 1f787f1..e98dfd0 100644
--- a/Makefile
+++ b/Makefile
@@ -86,7 +86,7 @@ endif
 
 ifeq ($(use_ollama), 1)
 	@ollama serve & \
-	ollama run $(LLM_OLLAMA) & \
+	ollama pull $(LLM_OLLAMA) & \
 	ollama pull $(EMBEDDING_OLLAMA) &
 endif
 
diff --git a/api/app/ds/ai_models.py b/api/app/ds/ai_models.py
index 1817464..58672ce 100644
--- a/api/app/ds/ai_models.py
+++ b/api/app/ds/ai_models.py
@@ -9,6 +9,8 @@ from llama_index.llms.ollama import Ollama
 from llama_index.embeddings.ollama import OllamaEmbedding
 import os
 
+OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL")
+
 class CustomOpenAIEmbedding(BaseEmbedding):
     """Embedding model class for LlamaIndex
 
@@ -82,7 +84,7 @@ def get_llm_model(client_type : str, model: str, max_tokens : int, temperature :
     elif client_type == "openai":
         return OpenAI(model=model, max_tokens=max_tokens, temperature=temperature, top_p=top_p, timeout=600)
     elif client_type == "ollama":
-        return Ollama(model="mistral", base_url = os.getenv("OLLAMA_BASE_URL"), max_tokens=max_tokens, temperature=temperature, top_p=top_p, request_timeout=120.0)
+        return Ollama(model="mistral", base_url = OLLAMA_BASE_URL, max_tokens=max_tokens, temperature=temperature, top_p=top_p, request_timeout=120.0)
     else :
         return "Please provide a right client type"
     
@@ -108,7 +110,7 @@ def get_embedding_model(client_type : str, model: str):
     elif client_type == "ollama":
         return OllamaEmbedding(
                 model_name=model,
-                base_url=os.getenv("OLLAMA_BASE_URL"),
+                base_url=OLLAMA_BASE_URL,
                 ollama_additional_kwargs={"mirostat": 0},
                 )
     else :
@@ -143,7 +145,7 @@ def llm_batch_inference(prompts : List[str], client_type : str, model : str, max
         prompt=prompts, model=model, max_tokens=max_tokens, temperature=temperature, top_p=top_p
     ).choices]
     elif client_type == "ollama" :
-        llm = get_llm_model(client_type=os.getenv('LLM_CLIENT_TYPE'), model=model,max_tokens=max_tokens, temperature=temperature, top_p=top_p )
+        llm = get_llm_model(client_type=client_type, model=model,max_tokens=max_tokens, temperature=temperature, top_p=top_p )
         output = []
         for prompt in prompts :
             output.append(llm.complete(prompt).text)
diff --git a/doc/Ollama.md b/doc/Ollama.md
index b1f192e..021dbae 100644
--- a/doc/Ollama.md
+++ b/doc/Ollama.md
@@ -12,14 +12,16 @@ Dès lors, nous configurerons Ollama afin qu'il écoute sur tous les ports rése
 launchctl setenv OLLAMA_HOST 0.0.0.0:8080 
 ```
 
-Pour télécharger et lancer un modèle LLM, nous ferons par exemple :
+La commande suivante permet de lancer le serveur Ollama :
 
 ```bash
-ollama run mistral
+ollama serve
 ```
 
-Pour un modèle d'embedding :
+Pour télécharger un modèle, nous ferons par exemple :
 
 ```bash
-ollama pull mxbai-embed-large
-```
\ No newline at end of file
+ollama pull mistral
+```
+
+Une fois téléchargé, le serveur Ollama lancera les modèles en mémoire en fonction des requêtes reçues par les utilisateurs. 
-- 
GitLab