|
#!/bin/bash
#
# debate_tool_triggering_challenge.sh - AI Debate Tool Triggering Challenge
#
# Checks that the AI Debate system collects and executes tool calls, i.e.
# that tool_calls are not discarded and action indicators are present.
#
# Environment:
#   HELIXAGENT_HOST  host shown in the banner (default: localhost)
#   HELIXAGENT_PORT  port shown in the banner (default: 7061)

set -e

# ANSI color sequences. They are stored as literal backslash escapes, so they
# only take effect when printed through `echo -e` or printf's %b.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# Locate this script, then the project root two directories above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Default configuration (overridable through the environment).
HOST="${HELIXAGENT_HOST:-localhost}"
PORT="${HELIXAGENT_PORT:-7061}"
BASE_URL="http://${HOST}:${PORT}"

# Every run writes into its own timestamped directory.
RUN_STAMP="$(date +%Y%m%d_%H%M%S)"
RESULTS_DIR="${PROJECT_ROOT}/challenges/results/debate_tool_triggering/${RUN_STAMP}"

# Banner: title, the list of verified behaviors, and the effective settings.
cat << BANNER

======================================================================
 HELIXAGENT AI DEBATE TOOL TRIGGERING CHALLENGE
======================================================================

BANNER
printf '%b\n' "${CYAN}This challenge verifies that the AI Debate system:${NC}"
cat << BANNER
 1. Collects tool_calls from debate positions
 2. Uses collected tool_calls in ACTION PHASE
 3. Shows action indicators (<---, --->)
 4. Does NOT discard tool_calls from LLM responses

Host: $HOST
Port: $PORT
Results: $RESULTS_DIR

BANNER

# Create results directory
mkdir -p "$RESULTS_DIR/results"

# Challenge tracking counters, updated by record_result.
TOTAL_TESTS=0
PASSED_TESTS=0
FAILED_TESTS=0
| 49 | + |
#######################################
# Record the outcome of one challenge test.
# Globals:
#   TOTAL_TESTS, PASSED_TESTS, FAILED_TESTS (incremented)
#   RESULTS_DIR, GREEN, RED, NC (read)
# Arguments:
#   $1 - test name shown on the console
#   $2 - status; "pass" counts as passed, any other value as failed
#   $3 - failure details (optional; written to the failure log)
# Outputs:
#   Prints a [PASS] or [FAIL] line to stdout. On failure, appends a labelled
#   entry to $RESULTS_DIR/results/failures.txt.
#######################################
record_result() {
  local test_name="$1"
  local status="$2"
  local details="${3:-}"
  local failures_dir="$RESULTS_DIR/results"

  TOTAL_TESTS=$((TOTAL_TESTS + 1))

  if [[ "$status" == "pass" ]]; then
    PASSED_TESTS=$((PASSED_TESTS + 1))
    # %b expands the color escapes only; the test name is printed verbatim.
    printf ' %b[PASS]%b %s\n' "$GREEN" "$NC" "$test_name"
  else
    FAILED_TESTS=$((FAILED_TESTS + 1))
    printf ' %b[FAIL]%b %s\n' "$RED" "$NC" "$test_name"
    # Make sure the log location exists even if the caller did not create it.
    mkdir -p "$failures_dir"
    # Label each entry with its test name so that several failures in the
    # same log can be told apart.
    {
      printf -- '----- [FAIL] %s -----\n' "$test_name"
      printf '%s\n\n' "$details"
    } >> "$failures_dir/failures.txt"
  fi
}
| 66 | + |
echo "----------------------------------------------------------------------"
echo "Phase 1: DebatePositionResponse Structure Tests"
echo "----------------------------------------------------------------------"

# The ./internal/... and ./cmd/... package paths used from here on are
# relative to the project root.
cd "$PROJECT_ROOT"

# One entry per test, fields separated by '|':
#   <progress message>|<go test -run pattern>|<log file name>|<result label>
phase1_cases=(
  "Testing DebatePositionResponse struct...|TestDebatePositionResponse_Struct|struct_test.txt|DebatePositionResponse struct holds content and tool_calls"
  "Testing empty tool calls handling...|TestDebatePositionResponse_EmptyToolCalls|empty_test.txt|Empty tool_calls handled correctly"
  "Testing multiple tool calls handling...|TestDebatePositionResponse_MultipleToolCalls|multiple_test.txt|Multiple tool_calls preserved correctly"
  "Testing JSON serialization...|TestDebatePositionResponse_JSONSerialization|json_test.txt|JSON serialization preserves tool_calls"
)

for phase1_case in "${phase1_cases[@]}"; do
  IFS='|' read -r run_msg run_pattern log_name result_label <<< "$phase1_case"
  log_file="$RESULTS_DIR/results/$log_name"

  echo -e "${BLUE}[RUN]${NC} $run_msg"
  # `go test -run` exits 0 when the pattern matches no test at all, so a pass
  # additionally requires at least one "=== RUN" line in the -v output.
  if go test -v -run "$run_pattern" ./internal/handlers/... > "$log_file" 2>&1 \
    && grep -q '^=== RUN' "$log_file"; then
    record_result "$result_label" "pass" ""
  else
    # Quoted $(<file) keeps working when the results path contains spaces.
    record_result "$result_label" "fail" "$(<"$log_file")"
  fi
done
| 104 | + |
echo ""
echo "----------------------------------------------------------------------"
echo "Phase 2: Tool Calls Collection Tests"
echo "----------------------------------------------------------------------"

# One entry per test, fields separated by '|':
#   <progress message>|<go test -run pattern>|<log file name>|<result label>
phase2_cases=(
  "Testing tool calls collection from debate positions...|TestToolCallsCollectionFromDebate|collection_test.txt|Tool calls collected from all debate positions"
  "Testing full debate tool calls integration...|TestDebateToolCallsIntegration|integration_test.txt|Full debate-to-action tool calls flow"
  "Testing tool calls preservation...|TestToolCallsNotDiscarded|preservation_test.txt|Tool calls are NOT discarded"
)

for phase2_case in "${phase2_cases[@]}"; do
  IFS='|' read -r run_msg run_pattern log_name result_label <<< "$phase2_case"
  log_file="$RESULTS_DIR/results/$log_name"

  echo -e "${BLUE}[RUN]${NC} $run_msg"
  # `go test -run` exits 0 when the pattern matches no test at all, so a pass
  # additionally requires at least one "=== RUN" line in the -v output.
  if go test -v -run "$run_pattern" ./internal/handlers/... > "$log_file" 2>&1 \
    && grep -q '^=== RUN' "$log_file"; then
    record_result "$result_label" "pass" ""
  else
    # Quoted $(<file) keeps working when the results path contains spaces.
    record_result "$result_label" "fail" "$(<"$log_file")"
  fi
done
| 133 | + |
echo ""
echo "----------------------------------------------------------------------"
echo "Phase 3: Action Indicator Tests"
echo "----------------------------------------------------------------------"

# One entry per test, fields separated by '|':
#   <progress message>|<go test -run pattern>|<log file name>|<result label>
phase3_cases=(
  "Testing action indicator generation...|TestActionIndicatorGeneration|indicator_gen_test.txt|Action indicators (<---, --->) generated correctly"
  "Testing action indicator visibility...|TestActionIndicatorVisibility|indicator_vis_test.txt|All action indicators visible in output"
)

for phase3_case in "${phase3_cases[@]}"; do
  IFS='|' read -r run_msg run_pattern log_name result_label <<< "$phase3_case"
  log_file="$RESULTS_DIR/results/$log_name"

  echo -e "${BLUE}[RUN]${NC} $run_msg"
  # `go test -run` exits 0 when the pattern matches no test at all, so a pass
  # additionally requires at least one "=== RUN" line in the -v output.
  if go test -v -run "$run_pattern" ./internal/handlers/... > "$log_file" 2>&1 \
    && grep -q '^=== RUN' "$log_file"; then
    record_result "$result_label" "pass" ""
  else
    # Quoted $(<file) keeps working when the results path contains spaces.
    record_result "$result_label" "fail" "$(<"$log_file")"
  fi
done
| 154 | + |
echo ""
echo "----------------------------------------------------------------------"
echo "Phase 4: Tool Call Generation Tests"
echo "----------------------------------------------------------------------"

# One entry per test, fields separated by '|':
#   <progress message>|<go test -run pattern>|<log file name>|<result label>
phase4_cases=(
  "Testing tool call generation with available tools...|TestGenerateActionToolCalls_WithTools|gen_with_tools_test.txt|Tool calls generated when tools available"
  "Testing tool call generation without tools...|TestGenerateActionToolCalls_NoTools|gen_no_tools_test.txt|No tool calls generated when no tools available"
  "Testing search query tool call generation...|TestGenerateActionToolCalls_SearchQuery|gen_search_test.txt|Grep tool calls generated for search queries"
)

for phase4_case in "${phase4_cases[@]}"; do
  IFS='|' read -r run_msg run_pattern log_name result_label <<< "$phase4_case"
  log_file="$RESULTS_DIR/results/$log_name"

  echo -e "${BLUE}[RUN]${NC} $run_msg"
  # `go test -run` exits 0 when the pattern matches no test at all, so a pass
  # additionally requires at least one "=== RUN" line in the -v output.
  if go test -v -run "$run_pattern" ./internal/handlers/... > "$log_file" 2>&1 \
    && grep -q '^=== RUN' "$log_file"; then
    record_result "$result_label" "pass" ""
  else
    # Quoted $(<file) keeps working when the results path contains spaces.
    record_result "$result_label" "fail" "$(<"$log_file")"
  fi
done
| 183 | + |
echo ""
echo "----------------------------------------------------------------------"
echo "Phase 5: Tool Call Structure Validation Tests"
echo "----------------------------------------------------------------------"

# One entry per test, fields separated by '|':
#   <progress message>|<go test -run pattern>|<log file name>|<result label>
phase5_cases=(
  "Testing streaming tool call field validation...|TestStreamingToolCall_FieldValidation|field_validation_test.txt|StreamingToolCall fields validated correctly"
  "Testing tool call arguments validation...|TestToolCallArgumentsValidation|args_validation_test.txt|Tool call arguments follow correct format (snake_case)"
)

for phase5_case in "${phase5_cases[@]}"; do
  IFS='|' read -r run_msg run_pattern log_name result_label <<< "$phase5_case"
  log_file="$RESULTS_DIR/results/$log_name"

  echo -e "${BLUE}[RUN]${NC} $run_msg"
  # `go test -run` exits 0 when the pattern matches no test at all, so a pass
  # additionally requires at least one "=== RUN" line in the -v output.
  if go test -v -run "$run_pattern" ./internal/handlers/... > "$log_file" 2>&1 \
    && grep -q '^=== RUN' "$log_file"; then
    record_result "$result_label" "pass" ""
  else
    # Quoted $(<file) keeps working when the results path contains spaces.
    record_result "$result_label" "fail" "$(<"$log_file")"
  fi
done
| 204 | + |
echo ""
echo "----------------------------------------------------------------------"
echo "Phase 6: Code Compilation and Build Test"
echo "----------------------------------------------------------------------"

# One entry per build, fields separated by '|':
#   <progress message>|<package path>|<log file name>|<result label>
phase6_builds=(
  "Testing code compilation...|./cmd/helixagent/|build_test.txt|HelixAgent compiles successfully with tool triggering changes"
  "Testing handlers package compilation...|./internal/handlers/|handlers_build_test.txt|Handlers package compiles with DebatePositionResponse"
)

for phase6_build in "${phase6_builds[@]}"; do
  IFS='|' read -r run_msg build_target log_name result_label <<< "$phase6_build"
  log_file="$RESULTS_DIR/results/$log_name"

  echo -e "${BLUE}[RUN]${NC} $run_msg"
  # The build artifact is discarded; only the exit status and the captured
  # compiler output are used.
  if go build -o /dev/null "$build_target" > "$log_file" 2>&1; then
    record_result "$result_label" "pass" ""
  else
    # Quoted $(<file) keeps working when the results path contains spaces.
    record_result "$result_label" "fail" "$(<"$log_file")"
  fi
done
| 225 | + |
# Final report: print the counters, then exit 1 if anything failed, or write
# the success marker file and exit 0.
cat << SUMMARY_HEADER

======================================================================
 CHALLENGE RESULTS
======================================================================

SUMMARY_HEADER
printf '%b\n' "Total Tests: ${BLUE}$TOTAL_TESTS${NC}"
printf '%b\n' "Passed: ${GREEN}$PASSED_TESTS${NC}"
printf '%b\n' "Failed: ${RED}$FAILED_TESTS${NC}"
printf '\n'

# Failure path first: report and stop with a non-zero status.
if [[ "$FAILED_TESTS" != "0" ]]; then
  printf '%b\n' "${RED}============================================${NC}"
  printf '%b\n' "${RED} CHALLENGE FAILED - $FAILED_TESTS TESTS FAILED ${NC}"
  printf '%b\n' "${RED}============================================${NC}"
  printf '\n'
  printf '%s\n' "See $RESULTS_DIR/results/failures.txt for details"
  exit 1
fi

# Everything passed from here on.
printf '%b\n' "${GREEN}============================================${NC}"
printf '%b\n' "${GREEN} ALL TESTS PASSED - CHALLENGE COMPLETE! ${NC}"
printf '%b\n' "${GREEN}============================================${NC}"
printf '\n'
printf '%b\n' "${CYAN}Tool triggering in AI Debate is working correctly:${NC}"
cat << 'SUMMARY_POINTS'
 - DebatePositionResponse returns content AND tool_calls
 - Tool calls collected from all 5 debate positions
 - ACTION PHASE uses collected tool_calls
 - Action indicators (<---, --->) are visible
 - Tool_calls are NOT discarded

SUMMARY_POINTS

# Save success summary (unquoted delimiter: $(date) and the counters expand).
cat > "$RESULTS_DIR/CHALLENGE_PASSED.txt" << EOF
AI Debate Tool Triggering Challenge PASSED
==========================================

Date: $(date)
Total Tests: $TOTAL_TESTS
Passed: $PASSED_TESTS
Failed: $FAILED_TESTS

The AI Debate system correctly:
1. Returns tool_calls from generateRealDebateResponse()
2. Collects tool_calls from all debate positions
3. Uses collected tool_calls in ACTION PHASE
4. Shows action indicators (<---, --->)
5. Does NOT discard tool_calls from LLM responses
EOF
exit 0
0 commit comments