Skip to content

Commit 7f2c203

Browse files
committed
style: format code with pre-commit
1 parent 7be8b1c commit 7f2c203

File tree

2 files changed

+51
-69
lines changed

2 files changed

+51
-69
lines changed

data_juicer/ops/mapper/generate_challenging_qa_mapper.py

Lines changed: 49 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
from data_juicer.utils.lazy_loader import LazyLoader
77
from data_juicer.utils.model_utils import get_model, prepare_model
88

9-
torch = LazyLoader('torch', 'torch')
10-
vllm = LazyLoader('vllm', 'vllm')
9+
torch = LazyLoader("torch", "torch")
10+
vllm = LazyLoader("vllm", "vllm")
1111

12-
OP_NAME = 'generate_challenging_qa_mapper'
12+
OP_NAME = "generate_challenging_qa_mapper"
1313

1414

1515
def retry_on_error(func, max_retries=5, delay=1):
@@ -28,7 +28,7 @@ def wrapper(*args, **kwargs):
2828
return func(*args, **kwargs)
2929
except Exception as e:
3030
retries += 1
31-
print(f'Error: {e}, retry {retries}/{max_retries}...')
31+
print(f"Error: {e}, retry {retries}/{max_retries}...")
3232
if retries >= max_retries:
3333
raise
3434
time.sleep(delay)
@@ -47,14 +47,16 @@ class GenerateChallengingQAMapper(Mapper):
4747
It helps AI models autonomously create high-quality, reasoning-focused QA pairs without human input.
4848
"""
4949

50-
_accelerator = 'cuda'
50+
_accelerator = "cuda"
5151

52-
def __init__(self,
53-
hf_model: str = 'Qwen/Qwen2.5-VL-7B-Instruct',
54-
category: str = 'Mathematical Reasoning',
55-
model_name: str = 'Qwen',
56-
*args,
57-
**kwargs):
52+
def __init__(
53+
self,
54+
hf_model: str = "Qwen/Qwen2.5-VL-7B-Instruct",
55+
category: str = "Mathematical Reasoning",
56+
model_name: str = "Qwen",
57+
*args,
58+
**kwargs,
59+
):
5860
"""
5961
Initialization method.
6062
@@ -99,8 +101,7 @@ def __init__(self,
99101
"""
100102
super().__init__(*args, **kwargs)
101103
self.hf_model = hf_model
102-
self.model_key = prepare_model(model_type='huggingface',
103-
pretrained_model_name_or_path=hf_model)
104+
self.model_key = prepare_model(model_type="huggingface", pretrained_model_name_or_path=hf_model)
104105
self.category = category
105106
self.model_name = model_name
106107
self.system_prompt = system_prompt
@@ -111,18 +112,14 @@ def __init__(self,
111112

112113
# tensor_parallel_size = torch.cuda.device_count()
113114
model_params = {}
114-
model_params['tensor_parallel_size'] = 4
115-
self.model_key = prepare_model(model_type='vllm',
116-
pretrained_model_name_or_path=hf_model,
117-
**model_params)
118-
self.sampling_params = vllm.SamplingParams(temperature=0.9,
119-
top_p=0.95,
120-
top_k=40,
121-
repetition_penalty=1.1,
122-
max_tokens=2048)
115+
model_params["tensor_parallel_size"] = 4
116+
self.model_key = prepare_model(model_type="vllm", pretrained_model_name_or_path=hf_model, **model_params)
117+
self.sampling_params = vllm.SamplingParams(
118+
temperature=0.9, top_p=0.95, top_k=40, repetition_penalty=1.1, max_tokens=2048
119+
)
123120

124121
def extract_json(self, text):
125-
pattern = r'```json\s*(\{.*?\})\s*```'
122+
pattern = r"```json\s*(\{.*?\})\s*```"
126123

127124
match = re.search(pattern, text, re.DOTALL)
128125

@@ -132,71 +129,55 @@ def extract_json(self, text):
132129
json_data = json.loads(json_str)
133130
return json_data
134131
except json.JSONDecodeError as e:
135-
print(f'JSON parse error: {e}')
132+
print(f"JSON parse error: {e}")
136133
return None
137134
else:
138-
print('None of valid JSON data')
135+
print("None of valid JSON data")
139136
return None
140137

141138
@retry_on_error
142139
def process_single(self, sample=None, rank=None):
143140

144141
if self.category is None:
145-
print(
146-
'This OP requires processing multiple fields, and you need to specify valid `category`'
147-
)
142+
print("This OP requires processing multiple fields, and you need to specify valid `category`")
148143

149144
model, _ = get_model(self.model_key, rank, self.use_cuda())
150145

151-
messages = [{
152-
'role':
153-
'system',
154-
'content':
155-
self.system_prompt.replace('Qwen', self.model_name)
156-
}, {
157-
'role':
158-
'user',
159-
'content':
160-
self.user_prompt_background.format(category=self.category).replace(
161-
'Qwen', self.model_name)
162-
}]
146+
messages = [
147+
{"role": "system", "content": self.system_prompt.replace("Qwen", self.model_name)},
148+
{
149+
"role": "user",
150+
"content": self.user_prompt_background.format(category=self.category).replace("Qwen", self.model_name),
151+
},
152+
]
163153
background = model.chat(messages, self.sampling_params)
164154

165-
messages.extend([{
166-
'role': 'system',
167-
'content': background[0].outputs[0].text
168-
}, {
169-
'role':
170-
'user',
171-
'content':
172-
self.user_prompt_subquestion.replace('Qwen', self.model_name)
173-
}])
155+
messages.extend(
156+
[
157+
{"role": "system", "content": background[0].outputs[0].text},
158+
{"role": "user", "content": self.user_prompt_subquestion.replace("Qwen", self.model_name)},
159+
]
160+
)
174161
sub_questions = model.chat(messages, self.sampling_params)
175162

176-
messages.extend([{
177-
'role': 'system',
178-
'content': sub_questions[0].outputs[0].text
179-
}, {
180-
'role':
181-
'user',
182-
'content':
183-
self.user_prompt_multihop.replace('Qwen', self.model_name)
184-
}])
163+
messages.extend(
164+
[
165+
{"role": "system", "content": sub_questions[0].outputs[0].text},
166+
{"role": "user", "content": self.user_prompt_multihop.replace("Qwen", self.model_name)},
167+
]
168+
)
185169
multihop = model.chat(messages, self.sampling_params)
186170

187-
messages.extend([{
188-
'role': 'system',
189-
'content': multihop[0].outputs[0].text
190-
}, {
191-
'role':
192-
'user',
193-
'content':
194-
self.extract_prompt_qa.replace('Qwen', self.model_name)
195-
}])
171+
messages.extend(
172+
[
173+
{"role": "system", "content": multihop[0].outputs[0].text},
174+
{"role": "user", "content": self.extract_prompt_qa.replace("Qwen", self.model_name)},
175+
]
176+
)
196177
qa = model.chat(messages, self.sampling_params)
197178

198179
qa = self.extract_json(qa[0].outputs[0].text)
199-
qa['thinking'] = multihop[0].outputs[0].text
180+
qa["thinking"] = multihop[0].outputs[0].text
200181

201182
sample.clear()
202183
sample.update(qa)

docs/Operators.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ Data-Juicer 中的算子分为以下 7 种类型。
4646
| [filter](#filter) | 54 | Filters out low-quality samples. 过滤低质量样本。 |
4747
| [formatter](#formatter) | 8 | Discovers, loads, and canonicalizes source data. 发现、加载、规范化原始数据。 |
4848
| [grouper](#grouper) | 3 | Group samples to batched samples. 将样本分组,每一组组成一个批量样本。 |
49-
| [mapper](#mapper) | 87 | Edits and transforms samples. 对数据样本进行编辑和转换。 |
49+
| [mapper](#mapper) | 88 | Edits and transforms samples. 对数据样本进行编辑和转换。 |
5050
| [selector](#selector) | 5 | Selects top samples based on ranking. 基于排序选取高质量样本。 |
5151

5252
All the specific operators are listed below, each featured with several capability tags.
@@ -205,6 +205,7 @@ All the specific operators are listed below, each featured with several capabili
205205
| extract_support_text_mapper | 🔤Text 💻CPU 🔗API 🟢Stable | Extracts a supporting sub-text from the original text based on a given summary. 根据给定的摘要从原始文本中提取支持子文本。 | [info](operators/mapper/extract_support_text_mapper.md) | - |
206206
| extract_tables_from_html_mapper | 🔤Text 💻CPU 🟡Beta | Extracts tables from HTML content and stores them in a specified field. 从HTML内容中提取表并将其存储在指定字段中。 | [info](operators/mapper/extract_tables_from_html_mapper.md) | - |
207207
| fix_unicode_mapper | 🔤Text 💻CPU 🟢Stable | Fixes unicode errors in text samples. 修复文本示例中的unicode错误。 | [info](operators/mapper/fix_unicode_mapper.md) | - |
208+
| generate_challenging_qa_mapper | 🚀GPU 🧩HF 🌊vLLM 🟡Beta | Mapper to generate self-challenging question and answer pairs. 用于生成自我挑战式问答对的映射器。 | - | - |
208209
| generate_qa_from_examples_mapper | 🚀GPU 🌊vLLM 🧩HF 🟢Stable | Generates question and answer pairs from examples using a Hugging Face model. 使用 Hugging Face 模型从示例生成问题和答案对。 | [info](operators/mapper/generate_qa_from_examples_mapper.md) | - |
209210
| generate_qa_from_text_mapper | 🔤Text 🚀GPU 🌊vLLM 🧩HF 🟢Stable | Generates question and answer pairs from text using a specified model. 使用指定的模型从文本生成问题和答案对。 | [info](operators/mapper/generate_qa_from_text_mapper.md) | - |
210211
| image_blur_mapper | 🏞Image 💻CPU 🟢Stable | Blurs images in the dataset with a specified probability and blur type. 使用指定的概率和模糊类型对数据集中的图像进行模糊处理。 | [info](operators/mapper/image_blur_mapper.md) | - |

0 commit comments

Comments
 (0)