hace 3 meses · 161170a67d
--- a/external/packages/bsp/rk3588/usr/bin/rknn_server
+++ b/external/packages/bsp/rk3588/usr/bin/rknn_server
--- a/external/packages/bsp/rk3588/usr/include/rkllm.h
+++ b/external/packages/bsp/rk3588/usr/include/rkllm.h
@@ -1,119 +1,271 @@
 
				-#ifndef _LLM_H_
			
 
				-#define _LLM_H_
			
 
				+#ifndef _RKLLM_H_
			
 
				+#define _RKLLM_H_
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 extern "C" {
			
 
				 #endif
			
 
				 
			
 
				-typedef void* LLMHandle;        /* Handle for an instance of a language model. */
			
 
				+/**
			
 
				+ * @typedef LLMHandle
			
 
				+ * @brief A handle used to manage and interact with the large language model.
			
 
				+ */
			
 
				+typedef void* LLMHandle;
			
 
				 
			
 
				 /**
			
 
				- * @brief Structure for possible states of an inference call.
			
 
				- * 
			
 
				+ * @enum LLMCallState
			
 
				+ * @brief Describes the possible states of an LLM call.
			
 
				  */
			
 
				 typedef enum {
			
 
				-    LLM_RUN_NORMAL = 0,         /* Inference status is normal and inference has not yet finished. */
			
 
				-    LLM_RUN_FINISH = 1,         /* Inference status is normal and inference has finished. */
			
 
				-    LLM_RUN_ERROR = 2           /* Inference status is abnormal. */
			
 
				+    RKLLM_RUN_NORMAL  = 0, /**< The LLM call is in a normal running state. */
			
 
				+    RKLLM_RUN_WAITING = 1, /**< The LLM call is waiting for a complete UTF-8 encoded character. */
			
 
				+    RKLLM_RUN_FINISH  = 2, /**< The LLM call has finished execution. */
			
 
				+    RKLLM_RUN_ERROR   = 3, /**< An error occurred during the LLM call. */
			
 
				+    RKLLM_RUN_GET_LAST_HIDDEN_LAYER = 4 /**< Retrieve the last hidden layer during inference. */
			
 
				 } LLMCallState;
			
 
				 
			
 
				 /**
			
 
				- * @brief Structure for setting up parameters for the language model
			
 
				- * 
			
 
				+ * @enum RKLLMInputType
			
 
				+ * @brief Defines the types of inputs that can be fed into the LLM.
			
 
				+ */
			
 
				+typedef enum {
			
 
				+    RKLLM_INPUT_PROMPT      = 0, /**< Input is a text prompt. */
			
 
				+    RKLLM_INPUT_TOKEN       = 1, /**< Input is a sequence of tokens. */
			
 
				+    RKLLM_INPUT_EMBED       = 2, /**< Input is an embedding vector. */
			
 
				+    RKLLM_INPUT_MULTIMODAL  = 3, /**< Input is multimodal (e.g., text and image). */
			
 
				+} RKLLMInputType;
			
 
				+
			
 
				+/**
			
 
				+ * @enum RKLLMInferMode
			
 
				+ * @brief Specifies the inference modes of the LLM.
			
 
				+ */
			
 
				+typedef enum {
			
 
				+    RKLLM_INFER_GENERATE                    = 0, /**< The LLM generates text based on input. */
			
 
				+    RKLLM_INFER_GET_LAST_HIDDEN_LAYER       = 1, /**< The LLM retrieves the last hidden layer for further processing. */
			
 
				+} RKLLMInferMode;
			
 
				+
			
 
				+/**
			
 
				+ * @struct RKLLMExtendParam
			
 
				+ * @brief Extended parameters for configuring an LLM instance.
			
 
				+ */
			
 
				+typedef struct {
			
 
				+    int32_t      base_domain_id;   /**< base_domain_id */
			
 
				+    uint8_t      reserved[112];    /**< reserved */
			
 
				+} RKLLMExtendParam;
			
 
				+
			
 
				+/**
			
 
				+ * @struct RKLLMParam
			
 
				+ * @brief Defines the parameters for configuring an LLM instance.
			
 
				  */
			
 
				 typedef struct {
			
 
				-    const char* model_path;     /* Path where the model file is located. */
			
 
				-    int32_t num_npu_core;       /* Number of NPU cores used for model inference. */
			
 
				-    int32_t max_context_len;    /* Maximum size of the context. */
			
 
				-    int32_t max_new_tokens;     /* Maximum number of tokens to generate during model inference. */
			
 
				-    int32_t top_k;              /* The number of highest probability tokens to consider for generation. */
			
 
				-    float top_p;                /* Nucleus sampling: cumulative probability cutoff to use for token selection. */
			
 
				-    float temperature;          /* Hyperparameter to control the randomness of predictions by scaling the logits before applying softmax. */
			
 
				-    float repeat_penalty;       /* Penalty applied to the logits of previously generated tokens, helps prevent repetitive or monotonic text. */
			
 
				-    float frequency_penalty;    /* Penalty for repeating the same word or phrase, reducing the likelihood of repeated content. */
			
 
				-    float presence_penalty;     /* Penalty or reward for introducing new tokens into the generated text. */
			
 
				-    int32_t mirostat;           /* Enables mirostat algorithm, where 0 = off, 1 = use mirostat algorithm, 2 = use mirostat 2.0 algorithm. */
			
 
				-    float mirostat_tau;         /* Target entropy (perplexity) for mirostat algorithm, setting the desired complexity of the generated text. */
			
 
				-    float mirostat_eta;         /* Learning rate for the mirostat algorithm. */
			
 
				-    bool logprobs;              /* Whether to return the log probabilities for each output token along with their token ids. */
			
 
				-    int32_t top_logprobs;       /* The number of top tokens for which to return log probabilities, along with their token ids. */
			
 
				-    bool use_gpu;               /* Flag to indicate whether to use GPU for inference. */
			
 
				+    const char* model_path;         /**< Path to the model file. */
			
 
				+    int32_t max_context_len;        /**< Maximum number of tokens in the context window. */
			
 
				+    int32_t max_new_tokens;         /**< Maximum number of new tokens to generate. */
			
 
				+    int32_t top_k;                  /**< Top-K sampling parameter for token generation. */
			
 
				+    float top_p;                    /**< Top-P (nucleus) sampling parameter. */
			
 
				+    float temperature;              /**< Sampling temperature, affecting the randomness of token selection. */
			
 
				+    float repeat_penalty;           /**< Penalty for repeating tokens in generation. */
			
 
				+    float frequency_penalty;        /**< Penalizes frequent tokens during generation. */
			
 
				+    float presence_penalty;         /**< Penalizes tokens based on their presence in the input. */
			
 
				+    int32_t mirostat;               /**< Mirostat sampling strategy flag (0 to disable). */
			
 
				+    float mirostat_tau;             /**< Tau parameter for Mirostat sampling. */
			
 
				+    float mirostat_eta;             /**< Eta parameter for Mirostat sampling. */
			
 
				+    bool skip_special_token;        /**< Whether to skip special tokens during generation. */
			
 
				+    bool is_async;                  /**< Whether to run inference asynchronously. */
			
 
				+    const char* img_start;          /**< Starting position of an image in multimodal input. */
			
 
				+    const char* img_end;            /**< Ending position of an image in multimodal input. */
			
 
				+    const char* img_content;        /**< Pointer to the image content. */
			
 
				+    RKLLMExtendParam extend_param; /**< Extend parameters. */
			
 
				 } RKLLMParam;
			
 
				 
			
 
				 /**
			
 
				- * @brief Structure representing a token with its associated log probability.
			
 
				- * 
			
 
				+ * @struct RKLLMLoraAdapter
			
 
				+ * @brief Defines parameters for a Lora adapter used in model fine-tuning.
			
 
				+ */
			
 
				+typedef struct {
			
 
				+    const char* lora_adapter_path; /**< Path to the Lora adapter file. */
			
 
				+    const char* lora_adapter_name; /**< Name of the Lora adapter. */
			
 
				+    float scale;                   /**< Scaling factor for applying the Lora adapter. */
			
 
				+} RKLLMLoraAdapter;
			
 
				+
			
 
				+/**
			
 
				+ * @struct RKLLMEmbedInput
			
 
				+ * @brief Represents an embedding input to the LLM.
			
 
				+ */
			
 
				+typedef struct {
			
 
				+    float* embed;      /**< Pointer to the embedding vector (of size n_tokens * n_embed). */
			
 
				+    size_t n_tokens;   /**< Number of tokens represented in the embedding. */
			
 
				+} RKLLMEmbedInput;
			
 
				+
			
 
				+/**
			
 
				+ * @struct RKLLMTokenInput
			
 
				+ * @brief Represents token input to the LLM.
			
 
				+ */
			
 
				+typedef struct {
			
 
				+    int32_t* input_ids; /**< Array of token IDs. */
			
 
				+    size_t n_tokens;    /**< Number of tokens in the input. */
			
 
				+} RKLLMTokenInput;
			
 
				+
			
 
				+/**
			
 
				+ * @struct RKLLMMultiModelInput
			
 
				+ * @brief Represents multimodal input (e.g., text and image).
			
 
				+ */
			
 
				+typedef struct {
			
 
				+    char* prompt;           /**< Text prompt input. */
			
 
				+    float* image_embed;     /**< Embedding of the image (of size n_image_tokens * n_image_embed). */
			
 
				+    size_t n_image_tokens;  /**< Number of image tokens. */
			
 
				+} RKLLMMultiModelInput;
			
 
				+
			
 
				+/**
			
 
				+ * @struct RKLLMInput
			
 
				+ * @brief Represents different types of input to the LLM via a union.
			
 
				+ */
			
 
				+typedef struct {
			
 
				+    RKLLMInputType input_type; /**< Specifies the type of input provided (e.g., prompt, token, embed, multimodal). */
			
 
				+    union {
			
 
				+        const char* prompt_input;               /**< Text prompt input if input_type is RKLLM_INPUT_PROMPT. */
			
 
				+        RKLLMEmbedInput embed_input;            /**< Embedding input if input_type is RKLLM_INPUT_EMBED. */
			
 
				+        RKLLMTokenInput token_input;            /**< Token input if input_type is RKLLM_INPUT_TOKEN. */
			
 
				+        RKLLMMultiModelInput multimodal_input;  /**< Multimodal input if input_type is RKLLM_INPUT_MULTIMODAL. */
			
 
				+    };
			
 
				+} RKLLMInput;
			
 
				+
			
 
				+/**
			
 
				+ * @struct RKLLMLoraParam
			
 
				+ * @brief Structure defining parameters for Lora adapters.
			
 
				+ */
			
 
				+typedef struct {
			
 
				+    const char* lora_adapter_name; /**< Name of the Lora adapter. */
			
 
				+} RKLLMLoraParam;
			
 
				+
			
 
				+/**
			
 
				+ * @struct RKLLMPromptCacheParam
			
 
				+ * @brief Structure to define parameters for caching prompts.
			
 
				+ */
			
 
				+typedef struct {
			
 
				+    int save_prompt_cache;          /**< Flag to indicate whether to save the prompt cache (0 = don't save, 1 = save). */
			
 
				+    const char* prompt_cache_path;  /**< Path to the prompt cache file. */
			
 
				+} RKLLMPromptCacheParam;
			
 
				+
			
 
				+/**
			
 
				+ * @struct RKLLMInferParam
			
 
				+ * @brief Structure for defining parameters during inference.
			
 
				+ */
			
 
				+typedef struct {
			
 
				+    RKLLMInferMode mode;                    /**< Inference mode (e.g., generate or get last hidden layer). */
			
 
				+    RKLLMLoraParam* lora_params;            /**< Pointer to Lora adapter parameters. */
			
 
				+    RKLLMPromptCacheParam* prompt_cache_params; /**< Pointer to prompt cache parameters. */
			
 
				+} RKLLMInferParam;
			
 
				+
			
 
				+/**
			
 
				+ * @struct RKLLMResultLastHiddenLayer
			
 
				+ * @brief Structure to hold the hidden states from the last layer.
			
 
				  */
			
 
				 typedef struct {
			
 
				-    float logprob;              /* Log probability corresponding to the token ID. */
			
 
				-    int id;                     /* Token ID. */
			
 
				-} Token;
			
 
				+    const float* hidden_states; /**< Pointer to the hidden states (of size num_tokens * embd_size). */
			
 
				+    int embd_size;              /**< Size of the embedding vector. */
			
 
				+    int num_tokens;             /**< Number of tokens for which hidden states are stored. */
			
 
				+} RKLLMResultLastHiddenLayer;
			
 
				 
			
 
				 /**
			
 
				- * @brief Structure to hold the results from the language model inference, including text and token details.
			
 
				- * 
			
 
				+ * @struct RKLLMResult
			
 
				+ * @brief Structure to represent the result of LLM inference.
			
 
				  */
			
 
				 typedef struct {
			
 
				-    const char* text;           /* Decoded text from the inference output. */
			
 
				-    Token* tokens;              /* Array of Token structures, each containing a log probability and a token ID. */
			
 
				-    int num;                    /* Number of top tokens returned, typically those with the highest probabilities. */
			
 
				+    const char* text;                        /**< Generated text result. */
			
 
				+    int32_t token_id;                        /**< ID of the generated token. */
			
 
				+    RKLLMResultLastHiddenLayer last_hidden_layer; /**< Hidden states of the last layer (if requested). */
			
 
				 } RKLLMResult;
			
 
				 
			
 
				 /**
			
 
				- * @brief Callback function for handling inference results.
			
 
				- * 
			
 
				- * @param result A pointer to an RKLLMResult struct containing the inference results.
			
 
				- * @param userdata A pointer to user-defined function or null if no user function was provided.
			
 
				- * @param state The state of the inference process, indicating success, failure, or completion.
			
 
				+ * @typedef LLMResultCallback
			
 
				+ * @brief Callback function to handle LLM results.
			
 
				+ * @param result Pointer to the LLM result.
			
 
				+ * @param userdata Pointer to user data for the callback.
			
 
				+ * @param state State of the LLM call (e.g., finished, error).
			
 
				  */
			
 
				 typedef void(*LLMResultCallback)(RKLLMResult* result, void* userdata, LLMCallState state);
			
 
				 
			
 
				 /**
			
 
				- * @brief Initializes RKLLMParam with default settings.
			
 
				- * 
			
 
				- * @return RKLLMParam An RKLLMParam struct with default values set.
			
 
				+ * @brief Creates a default RKLLMParam structure with preset values.
			
 
				+ * @return A default RKLLMParam structure.
			
 
				  */
			
 
				 RKLLMParam rkllm_createDefaultParam();
			
 
				 
			
 
				 /**
			
 
				- * @brief Initializes the model with specified parameters.
			
 
				- * 
			
 
				- * @param handle Pointer to a handle for the language model, which will be initialized by this function.
			
 
				- * @param param An RKLLMParam struct containing all the parameters needed for the model.
			
 
				- * @param callback A function pointer to the callback that handles the results of the inference.
			
 
				- * @return int Returns 0 on success, or a negative error code on failure.
			
 
				+ * @brief Initializes the LLM with the given parameters.
			
 
				+ * @param handle Pointer to the LLM handle.
			
 
				+ * @param param Configuration parameters for the LLM.
			
 
				+ * @param callback Callback function to handle LLM results.
			
 
				+ * @return Status code (0 for success, non-zero for failure).
			
 
				  */
			
 
				-int rkllm_init(LLMHandle* handle, RKLLMParam param, LLMResultCallback callback);
			
 
				+int rkllm_init(LLMHandle* handle, RKLLMParam* param, LLMResultCallback callback);
			
 
				 
			
 
				 /**
			
 
				- * @brief Releases the model resources.
			
 
				- * 
			
 
				- * @param handle The handle to the language model to be destroyed.
			
 
				- * @return int Returns 0 on successful release, or a negative error code if an error occurs.
			
 
				+ * @brief Loads a Lora adapter into the LLM.
			
 
				+ * @param handle LLM handle.
			
 
				+ * @param lora_adapter Pointer to the Lora adapter structure.
			
 
				+ * @return Status code (0 for success, non-zero for failure).
			
 
				+ */
			
 
				+int rkllm_load_lora(LLMHandle handle, RKLLMLoraAdapter* lora_adapter);
			
 
				+
			
 
				+/**
			
 
				+ * @brief Loads a prompt cache from a file.
			
 
				+ * @param handle LLM handle.
			
 
				+ * @param prompt_cache_path Path to the prompt cache file.
			
 
				+ * @return Status code (0 for success, non-zero for failure).
			
 
				+ */
			
 
				+int rkllm_load_prompt_cache(LLMHandle handle, const char* prompt_cache_path);
			
 
				+
			
 
				+/**
			
 
				+ * @brief Releases the prompt cache from memory.
			
 
				+ * @param handle LLM handle.
			
 
				+ * @return Status code (0 for success, non-zero for failure).
			
 
				+ */
			
 
				+int rkllm_release_prompt_cache(LLMHandle handle);
			
 
				+
			
 
				+/**
			
 
				+ * @brief Destroys the LLM instance and releases resources.
			
 
				+ * @param handle LLM handle.
			
 
				+ * @return Status code (0 for success, non-zero for failure).
			
 
				  */
			
 
				 int rkllm_destroy(LLMHandle handle);
			
 
				 
			
 
				 /**
			
 
				- * @brief Runs model inference on the given prompt.
			
 
				- * 
			
 
				- * @param handle The handle to the initialized language model.
			
 
				- * @param prompt The text prompt on which to perform inference.
			
 
				- * @param userdata Optional user-defined function that will be passed to the callback.
			
 
				- * @return int Returns 0 on success, or a negative error code if an error occurs during inference.
			
 
				+ * @brief Runs an LLM inference task synchronously.
			
 
				+ * @param handle LLM handle.
			
 
				+ * @param rkllm_input Input data for the LLM.
			
 
				+ * @param rkllm_infer_params Parameters for the inference task.
			
 
				+ * @param userdata Pointer to user data for the callback.
			
 
				+ * @return Status code (0 for success, non-zero for failure).
			
 
				  */
			
 
				-int rkllm_run(LLMHandle handle, const char* prompt, void* userdata);
			
 
				+int rkllm_run(LLMHandle handle, RKLLMInput* rkllm_input, RKLLMInferParam* rkllm_infer_params, void* userdata);
			
 
				 
			
 
				 /**
			
 
				- * @brief Aborts the current inference process.
			
 
				- * 
			
 
				- * @param handle The handle to the language model whose inference is to be aborted.
			
 
				- * @return int Returns 0 if the process is successfully aborted, or a negative error code
			
 
				- *         if no process was running or if the abort fails.
			
 
				+ * @brief Runs an LLM inference task asynchronously.
			
 
				+ * @param handle LLM handle.
			
 
				+ * @param rkllm_input Input data for the LLM.
			
 
				+ * @param rkllm_infer_params Parameters for the inference task.
			
 
				+ * @param userdata Pointer to user data for the callback.
			
 
				+ * @return Status code (0 for success, non-zero for failure).
			
 
				+ */
			
 
				+int rkllm_run_async(LLMHandle handle, RKLLMInput* rkllm_input, RKLLMInferParam* rkllm_infer_params, void* userdata);
			
 
				+
			
 
				+/**
			
 
				+ * @brief Aborts an ongoing LLM task.
			
 
				+ * @param handle LLM handle.
			
 
				+ * @return Status code (0 for success, non-zero for failure).
			
 
				  */
			
 
				 int rkllm_abort(LLMHandle handle);
			
 
				 
			
 
				+/**
			
 
				+ * @brief Checks if an LLM task is currently running.
			
 
				+ * @param handle LLM handle.
			
 
				+ * @return Status code (0 if a task is running, non-zero otherwise).
			
 
				+ */
			
 
				+int rkllm_is_running(LLMHandle handle);
			
 
				+
			
 
				 #ifdef __cplusplus
			
 
				-} //extern "C"
			
 
				+}
			
 
				 #endif
			
 
				 
			
 
				-#endif
			
 
				+#endif
			
--- a/external/packages/bsp/rk3588/usr/lib/librkllmrt.so
+++ b/external/packages/bsp/rk3588/usr/lib/librkllmrt.so
--- a/external/packages/bsp/rk3588/usr/lib/librknnrt.so
+++ b/external/packages/bsp/rk3588/usr/lib/librknnrt.so