Getting ValueError with RagEvaluatorPack in llama-index and gpt-3.5-turbo

I am getting a ValueError when running RagEvaluatorPack in llama-index together with ragas. Below is the code:

from llama_index.core.llama_pack import download_llama_pack
from llama_index.llms.openai import OpenAI

judge_llm = OpenAI(temperature=0, model="gpt-3.5-turbo")

RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack")
rag_evaluator = RagEvaluatorPack(
    query_engine=query_engine,
    rag_dataset=rag_dataset,  # defined in 1A
    judge_llm=judge_llm,
    show_progress=True,
)

# run in a Jupyter cell, so top-level await is available
benchmark_df = await rag_evaluator.arun(
    batch_size=2,  # batches the number of OpenAI API calls to make
    sleep_time_in_seconds=60,  # seconds to sleep before making an API call
)

Below is the stack trace:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[23], line 30
     16 rag_evaluator = RagEvaluatorPack(
     17     query_engine=query_engine,
     18     rag_dataset=rag_dataset,  # defined in 1A
     19     judge_llm=judge_llm,
     20     show_progress=True,
     21 )
     23 ############################################################################
     24 # NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 #
     25 # then you'll need to use different batch_size and sleep_time_in_seconds.  #
     26 # For Usage Tier 1, settings that seemed to work well were batch_size=5,   #
     27 # and sleep_time_in_seconds=15 (as of December 2023.)                      #
     28 ############################################################################
---> 30 benchmark_df = await rag_evaluator.arun(
     31     batch_size=2,  # batches the number of openai api calls to make
     32     sleep_time_in_seconds=60,  # seconds to sleep before making an api call
     33 )

File D:\documents\github\infinitejoy_courses\creating-gpt-chatbots-for-enterprise-useca-vt9QSr1Q-py3.10\lib\site-packages\llama_index\packs\rag_evaluator\base.py:442, in RagEvaluatorPack.arun(self, batch_size, sleep_time_in_seconds)
    440 # which is heavily rate-limited
    441 eval_batch_size = int(max(batch_size / 4, 1))
--> 442 return await self._amake_evaluations(
    443     batch_size=eval_batch_size, sleep_time_in_seconds=eval_sleep_time_in_seconds
    444 )

File D:\documents\github\infinitejoy_courses\creating-gpt-chatbots-for-enterprise-useca-vt9QSr1Q-py3.10\lib\site-packages\llama_index\packs\rag_evaluator\base.py:366, in RagEvaluatorPack._amake_evaluations(self, batch_size, sleep_time_in_seconds)
    364 # do this in batches to avoid RateLimitError
    365 try:
--> 366     eval_results: List[EvaluationResult] = await asyncio.gather(*tasks)
    367 except RateLimitError as err:
    368     if self.show_progress:

File D:\ProgramData\miniconda3\lib\asyncio\tasks.py:304, in Task.__wakeup(self, future)
    302 def __wakeup(self, future):
    303     try:
--> 304         future.result()
    305     except BaseException as exc:
    306         # This may also be a cancellation.
    307         self.__step(exc)

File D:\ProgramData\miniconda3\lib\asyncio\tasks.py:232, in Task.__step(***failed resolving arguments***)
    228 try:
    229     if exc is None:
    230         # We use the `send` method directly, because coroutines
    231         # don't have `__iter__` and `__next__` methods.
--> 232         result = coro.send(None)
    233     else:
    234         result = coro.throw(exc)

File ...\lib\site-packages\llama_index\core\evaluation\correctness.py:146, in CorrectnessEvaluator.aevaluate(***failed resolving arguments***)
    138 eval_response = await self._llm.apredict(
    139     prompt=self._eval_template,
    140     query=query,
    141     generated_answer=response,
    142     reference_answer=reference or "(NO REFERENCE ANSWER SUPPLIED)",
    143 )
    145 # Use the parser function
--> 146 score, reasoning = self.parser_function(eval_response)
    148 return EvaluationResult(
    149     query=query,
    150     response=response,
   (...)
    153     feedback=reasoning,
    154 )

File ...lib\site-packages\llama_index\core\evaluation\eval_utils.py:183, in default_parser(eval_response)
    173 """
    174 Default parser function for evaluation response.
    175 
   (...)
    180     Tuple[float, str]: A tuple containing the score as a float and the reasoning as a string.
    181 """
    182 score_str, reasoning_str = eval_response.split("\n", 1)
--> 183 score = float(score_str)
    184 reasoning = reasoning_str.lstrip("\n")
    185 return score, reasoning

ValueError: could not convert string to float: ''
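
From the traceback, the error comes from default_parser in eval_utils.py, which expects the judge LLM's reply to put a numeric score on the first line and the reasoning after it. If gpt-3.5-turbo returns a reply whose first line is blank (or an empty reply), the float() call fails exactly as shown. Here is a minimal sketch of that parsing logic, paraphrased from the traceback (the function name below is mine, not the library's):

def parse_like_default_parser(eval_response: str):
    # mirrors default_parser from eval_utils.py in the traceback above
    score_str, reasoning_str = eval_response.split("\n", 1)
    score = float(score_str)  # raises ValueError if the first line is empty
    reasoning = reasoning_str.lstrip("\n")
    return score, reasoning

print(parse_like_default_parser("4.0\nThe answer matches the reference."))  # (4.0, '...')

try:
    # a reply that starts with a blank line reproduces the error above
    parse_like_default_parser("\n4.0\nThe answer matches the reference.")
except ValueError as err:
    print(err)  # could not convert string to float: ''

So it looks like the judge model returned a reply that does not start with the score for at least one example. What is causing this with gpt-3.5-turbo, and how can I make the evaluation robust to it?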

Below are the relevant dependencies from my pyproject.toml:

python = ">=3.10,<3.12"
streamlit = "^1.31.1"
llama-index = "^0.10.9"
llama-index-embeddings-huggingface = "^0.1.1"
llama-index-llms-ollama = "^0.1.1"
ragas = "^0.1.2"
spacy = "^3.7.4"