diff --git a/docs/source/de/peft.md b/docs/source/de/peft.md
index bdc0684d79..eda8ce9435 100644
--- a/docs/source/de/peft.md
+++ b/docs/source/de/peft.md
@@ -86,10 +86,10 @@ model.load_adapter(peft_model_id)
 Die `bitsandbytes`-Integration unterstützt Datentypen mit 8bit und 4bit Genauigkeit, was für das Laden großer Modelle nützlich ist, weil es Speicher spart (lesen Sie den `bitsandbytes`-Integrations [guide](./quantization#bitsandbytes-integration), um mehr zu erfahren). Fügen Sie die Parameter `load_in_8bit` oder `load_in_4bit` zu [`~PreTrainedModel.from_pretrained`] hinzu und setzen Sie `device_map="auto"`, um das Modell effektiv auf Ihre Hardware zu verteilen:
 
 ```py
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
 peft_model_id = "ybelkada/opt-350m-lora"
-model = AutoModelForCausalLM.from_pretrained(peft_model_id, device_map="auto", load_in_8bit=True)
+model = AutoModelForCausalLM.from_pretrained(peft_model_id, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
 ```
 
 ## Einen neuen Adapter hinzufügen
diff --git a/docs/source/en/peft.md b/docs/source/en/peft.md
index 9e2ac805b2..01fe2fad87 100644
--- a/docs/source/en/peft.md
+++ b/docs/source/en/peft.md
@@ -88,10 +88,10 @@ Check out the [API documentation](#transformers.integrations.PeftAdapterMixin) s
 The `bitsandbytes` integration supports 8bit and 4bit precision data types, which are useful for loading large models because it saves memory (see the `bitsandbytes` integration [guide](./quantization#bitsandbytes-integration) to learn more). Add the `load_in_8bit` or `load_in_4bit` parameters to [`~PreTrainedModel.from_pretrained`] and set `device_map="auto"` to effectively distribute the model to your hardware:
 
 ```py
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
 peft_model_id = "ybelkada/opt-350m-lora"
-model = AutoModelForCausalLM.from_pretrained(peft_model_id, device_map="auto", load_in_8bit=True)
+model = AutoModelForCausalLM.from_pretrained(peft_model_id, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
 ```
 
 ## Add a new adapter
diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md
index 41a5d09a0d..a81fc49381 100644
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@@ -354,20 +354,20 @@ If you're curious and interested in learning more about the concepts underlying
 To load a model in 8-bit for inference, use the `load_in_8bit` parameter. The `device_map` parameter is optional, but we recommend setting it to `"auto"` to allow 🤗 Accelerate to automatically and efficiently allocate the model given the available resources in the environment:
 
 ```py
-from transformers import AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, BitsAndBytesConfig
 
 model_name = "bigscience/bloom-2b5"
-model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
+model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
 ```
 
 If you're loading a model in 8-bit for text generation, you should use the [`~transformers.GenerationMixin.generate`] method instead of the [`Pipeline`] function which is not optimized for 8-bit models and will be slower. Some sampling strategies, like nucleus sampling, are also not supported by the [`Pipeline`] for 8-bit models. You should also place all inputs on the same device as the model:
 
 ```py
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
 model_name = "bigscience/bloom-2b5"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
+model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
 
 prompt = "Hello, my llama is cute"
 inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
diff --git a/docs/source/it/perf_infer_gpu_one.md b/docs/source/it/perf_infer_gpu_one.md
index 16f77b3b1f..e618ec34a1 100644
--- a/docs/source/it/perf_infer_gpu_one.md
+++ b/docs/source/it/perf_infer_gpu_one.md
@@ -55,10 +55,10 @@ Di seguito sono riportate alcune note per aiutarvi a utilizzare questo modulo, o
 Dopo aver installato le librerie necessarie, per caricare il tuo modello mixed 8-bit è il seguente:
 
 ```py
-from transformers import AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, BitsAndBytesConfig
 
 model_name = "bigscience/bloom-2b5"
-model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
+model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
 ```
 
 Per la generazione di testo, si consiglia di:
@@ -69,11 +69,11 @@ Per la generazione di testo, si consiglia di:
 Ecco un semplice esempio:
 
 ```py
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
 model_name = "bigscience/bloom-2b5"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
+model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
 
 text = "Hello, my llama is cute"
 inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
@@ -87,7 +87,7 @@ outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
 Usare il seguente modo caricare il modello mixed-8bit su più GPU (stesso comando della configurazione a GPU singola):
 ```py
 model_name = "bigscience/bloom-2b5"
-model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
+model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
 ```
 
 Puoi controllare la RAM della GPU che si vuole allocare su ogni GPU usando `accelerate`. Utilizzare l'argomento `max_memory` come segue:
diff --git a/docs/source/ja/main_classes/quantization.md b/docs/source/ja/main_classes/quantization.md
index 3af3130a84..a93d06b257 100644
--- a/docs/source/ja/main_classes/quantization.md
+++ b/docs/source/ja/main_classes/quantization.md
@@ -245,12 +245,12 @@ model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_i
 
 ```python
 # pip install transformers accelerate bitsandbytes
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
 model_id = "bigscience/bloom-1b7"
 
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_8bit=True)
+model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
 ```
 
 次に、通常 [`PreTrainedModel`] を使用するのと同じようにモデルを使用します。
@@ -321,9 +321,9 @@ model_double_quant = AutoModelForCausalLM.from_pretrained(model_id, quantization
 この機能を使用できるようにするには、必ず `bitsandbytes>0.37.2` を使用してください (この記事の執筆時点では、`bitsandbytes==0.38.0.post1` でテストしました)。
 
 ```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
-model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", device_map="auto", load_in_8bit=True)
+model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", quantization_config=BitsAndBytesConfig(load_in_8bit=True))
 tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
 
 model.push_to_hub("bloom-560m-8bit")
diff --git a/docs/source/ja/peft.md b/docs/source/ja/peft.md
index 5cc687f70b..c3d195adbd 100644
--- a/docs/source/ja/peft.md
+++ b/docs/source/ja/peft.md
@@ -91,10 +91,10 @@ model.load_adapter(peft_model_id)
 `bitsandbytes` 統合は、8ビットおよび4ビットの精度データ型をサポートしており、大規模なモデルを読み込む際にメモリを節約するのに役立ちます(詳細については `bitsandbytes` 統合の[ガイド](./quantization#bitsandbytes-integration)を参照してください)。[`~PreTrainedModel.from_pretrained`] に `load_in_8bit` または `load_in_4bit` パラメータを追加し、`device_map="auto"` を設定してモデルを効果的にハードウェアに分散配置できます:
 
 ```py
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
 peft_model_id = "ybelkada/opt-350m-lora"
-model = AutoModelForCausalLM.from_pretrained(peft_model_id, device_map="auto", load_in_8bit=True)
+model = AutoModelForCausalLM.from_pretrained(peft_model_id, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
 ```
 
 ## Add a new adapter
diff --git a/docs/source/ja/perf_infer_gpu_one.md b/docs/source/ja/perf_infer_gpu_one.md
index 6d7466e022..d6a9b30916 100644
--- a/docs/source/ja/perf_infer_gpu_one.md
+++ b/docs/source/ja/perf_infer_gpu_one.md
@@ -357,10 +357,10 @@ Int8混合精度行列分解は、行列乗算を2つのストリームに分割
 必要なライブラリをインストールした後、ミックス 8 ビットモデルを読み込む方法は次の通りです:
 
 ```py
-from transformers import AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, BitsAndBytesConfig
 
 model_name = "bigscience/bloom-2b5"
-model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
+model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
 ```
 
 以下はシンプルな例です:
@@ -370,11 +370,11 @@ model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto",
 
 
 ```py
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
 model_name = "bigscience/bloom-2b5"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
+model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
 
 prompt = "Hello, my llama is cute"
 inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
@@ -388,7 +388,7 @@ outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
 
 ```py
 model_name = "bigscience/bloom-2b5"
-model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
+model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
 ```
 
 `accelerate`を使用して各GPUに割り当てるGPU RAMを制御する際には、以下のように`max_memory`引数を使用します:
diff --git a/docs/source/ko/peft.md b/docs/source/ko/peft.md
index 90327e62c2..d4ef0ba539 100644
--- a/docs/source/ko/peft.md
+++ b/docs/source/ko/peft.md
@@ -86,10 +86,10 @@ model.load_adapter(peft_model_id)
 `bitsandbytes` 통합은 8비트와 4비트 정밀도 데이터 유형을 지원하므로 큰 모델을 가져올 때 유용하면서 메모리도 절약합니다. 모델을 하드웨어에 효과적으로 분배하려면 [`~PreTrainedModel.from_pretrained`]에 `load_in_8bit` 또는 `load_in_4bit` 매개변수를 추가하고 `device_map="auto"`를 설정하세요:
 
 ```py
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
 peft_model_id = "ybelkada/opt-350m-lora"
-model = AutoModelForCausalLM.from_pretrained(peft_model_id, device_map="auto", load_in_8bit=True)
+model = AutoModelForCausalLM.from_pretrained(peft_model_id, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
 ```
 
 ## 새 어댑터 추가 [[add-a-new-adapter]]
diff --git a/docs/source/ko/perf_infer_gpu_one.md b/docs/source/ko/perf_infer_gpu_one.md
index 73cef858b9..d6ddca6cd0 100644
--- a/docs/source/ko/perf_infer_gpu_one.md
+++ b/docs/source/ko/perf_infer_gpu_one.md
@@ -127,10 +127,10 @@ Int8 혼합 정밀도 행렬 분해는 행렬 곱셈을 두 개의 스트림으
 필요한 라이브러리를 설치한 후 혼합 8비트 모델을 가져오는 방법은 다음과 같습니다:
 
 ```py
-from transformers import AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, BitsAndBytesConfig
 
 model_name = "bigscience/bloom-2b5"
-model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
+model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
 ```
 
 텍스트 생성의 경우:
@@ -141,11 +141,11 @@ model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto",
 다음은 간단한 예입니다:
 
 ```py
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
 model_name = "bigscience/bloom-2b5"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
+model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
 
 prompt = "Hello, my llama is cute"
 inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
@@ -159,7 +159,7 @@ outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
 다중 GPU에서 혼합 8비트 모델을 로드하는 방법은 단일 GPU 설정과 동일합니다(동일한 명령어 사용):
 ```py
 model_name = "bigscience/bloom-2b5"
-model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
+model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
 ```
 
 하지만 `accelerate`를 사용하여 각 GPU에 할당할 GPU RAM을 제어할 수 있습니다. 다음과 같이 `max_memory` 인수를 사용하세요:
diff --git a/docs/source/zh/main_classes/quantization.md b/docs/source/zh/main_classes/quantization.md
index 3c7e4d9212..d303906a99 100644
--- a/docs/source/zh/main_classes/quantization.md
+++ b/docs/source/zh/main_classes/quantization.md
@@ -360,12 +360,12 @@ model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_i
 
 ```python
 # pip install transformers accelerate bitsandbytes
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
 model_id = "bigscience/bloom-1b7"
 
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_8bit=True)
+model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
 ```
 
 然后,像通常使用 `PreTrainedModel` 一样使用您的模型。
@@ -441,9 +441,9 @@ model_double_quant = AutoModelForCausalLM.from_pretrained(model_id, quantization
 
 
 ```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
-model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", device_map="auto", load_in_8bit=True)
+model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", quantization_config=BitsAndBytesConfig(load_in_8bit=True))
 tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
 
 model.push_to_hub("bloom-560m-8bit")
diff --git a/docs/source/zh/peft.md b/docs/source/zh/peft.md
index 4241a15c00..de7ae6d155 100644
--- a/docs/source/zh/peft.md
+++ b/docs/source/zh/peft.md
@@ -86,10 +86,10 @@ model.load_adapter(peft_model_id)
 `bitsandbytes`集成支持8bit和4bit精度数据类型,这对于加载大模型非常有用,因为它可以节省内存(请参阅`bitsandbytes`[指南](./quantization#bitsandbytes-integration)以了解更多信息)。要有效地将模型分配到您的硬件,请在[`~PreTrainedModel.from_pretrained`]中添加`load_in_8bit`或`load_in_4bit`参数,并将`device_map="auto"`设置为:
 
 ```py
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
 peft_model_id = "ybelkada/opt-350m-lora"
-model = AutoModelForCausalLM.from_pretrained(peft_model_id, device_map="auto", load_in_8bit=True)
+model = AutoModelForCausalLM.from_pretrained(peft_model_id, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
 ```
 
 ## 添加新的adapter
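All of the updated snippets converge on the same pattern: build a `BitsAndBytesConfig` and pass it to `from_pretrained` via `quantization_config`, instead of passing `load_in_8bit=True` directly. A minimal end-to-end sketch of that pattern is below; the checkpoint and prompt come from the docs above, while the `device_map="auto"` placement, 4-bit settings, and generation arguments are illustrative assumptions rather than part of the patch:

```py
# Sketch of the quantization_config pattern; requires transformers, accelerate,
# bitsandbytes, and a CUDA GPU. The 4-bit values below are illustrative only.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "bigscience/bloom-2b5"

# 8-bit, matching the updated snippets; device_map="auto" lets 🤗 Accelerate
# place the quantized weights across the available devices.
config_8bit = BitsAndBytesConfig(load_in_8bit=True)
model_8bit = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=config_8bit, device_map="auto"
)

# The 4-bit variant mentioned in the prose is built the same way (assumed settings)
# and passed as quantization_config in place of config_8bit.
config_4bit = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# Generate with the 8-bit model, keeping the inputs on the same device as the weights.
tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs = tokenizer("Hello, my llama is cute", return_tensors="pt").to(model_8bit.device)
generated_ids = model_8bit.generate(**inputs, max_new_tokens=20)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
```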