Hello! I'm really enjoying the course.
I worked through the practice examples you provided step by step, and to try training on larger data I'm now practicing with a large CSV file that has 10 columns.
I changed only the variables and am running the same code.
The last lines of the code,
results = avg_by_count.collect()
print(results)
throw an error, and I'd like to ask how to resolve it.
===
While searching around, I read that a version mismatch between PySpark and Python can cause errors, so I checked the versions as well.
print(sc.version)
# 3.3.2
print(sc.pythonVer)
# 3.10
print(sc.master)
#local[*]
Below is the full error message.
1197 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
1198 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
File /usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py:1321, in JavaMember.__call__(self, *args)
1315 command = proto.CALL_COMMAND_NAME +\
1316 self.command_header +\
1317 args_command +\
1318 proto.END_COMMAND_PART
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1324 for temp_arg in temp_args:
1325 temp_arg._detach()
File /usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
331 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
332 format(target_id, ".", name, value))
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 10.0 failed 1 times, most recent failure: Lost task 1.0 in stage 10.0 (TID 11) (b4b9f5895184 executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 686, in main
process()
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 676, in process
out_iter = func(split_index, iterator)
File "/usr/local/spark/python/pyspark/rdd.py", line 3472, in pipeline_func
return func(split, prev_func(split, iterator))
File "/usr/local/spark/python/pyspark/rdd.py", line 3472, in pipeline_func
return func(split, prev_func(split, iterator))
File "/usr/local/spark/python/pyspark/rdd.py", line 540, in func
return f(iterator)
File "/usr/local/spark/python/pyspark/rdd.py", line 2554, in combineLocally
merger.mergeValues(iterator)
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/shuffle.py", line 253, in mergeValues
for k, v in iterator:
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/util.py", line 81, in wrapper
return f(*args, **kwargs)
File "/tmp/ipykernel_35939/1438163465.py", line 11, in parse_line
ValueError: invalid literal for int() with base 10: '61.760999927297242'
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:552)
at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:758)
at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:740)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:505)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1211)
at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1217)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
at org.apache.spark.scheduler.Task.run(Task.scala:136)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
at java.base/java.lang.Thread.run(Thread.java:833)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2238)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2259)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2278)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1021)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
at org.apache.spark.rdd.RDD.collect(RDD.scala:1020)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:180)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:568)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 686, in main
process()
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 676, in process
out_iter = func(split_index, iterator)
File "/usr/local/spark/python/pyspark/rdd.py", line 3472, in pipeline_func
return func(split, prev_func(split, iterator))
File "/usr/local/spark/python/pyspark/rdd.py", line 3472, in pipeline_func
return func(split, prev_func(split, iterator))
File "/usr/local/spark/python/pyspark/rdd.py", line 540, in func
return f(iterator)
File "/usr/local/spark/python/pyspark/rdd.py", line 2554, in combineLocally
merger.mergeValues(iterator)
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/shuffle.py", line 253, in mergeValues
for k, v in iterator:
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/util.py", line 81, in wrapper
return f(*args, **kwargs)
File "/tmp/ipykernel_35939/1438163465.py", line 11, in parse_line
ValueError: invalid literal for int() with base 10: '61.760999927297242'
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:552)
at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:758)
at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:740)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:505)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1211)
at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1217)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
at org.apache.spark.scheduler.Task.run(Task.scala:136)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
... 1 more
ㅠㅠ If you happen to know the solution, I'd really appreciate an answer..
Thank you!!!!
1 Answer
Hello Communication,
Judging from the error, the problem appears to be in your data.
ValueError: invalid literal for int() with base 10: '61.760999927297242'
It's complaining that '61.760999927297242' cannot be converted to an integer with int().
There's a detailed discussion on Stack Overflow as well :D
https://stackoverflow.com/questions/1841565/valueerror-invalid-literal-for-int-with-base-10
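Since the traceback points at parse_line (line 11 of your notebook cell), the usual fix is to stop calling int() on that column and parse it as a float instead. Below is a minimal sketch of what that might look like; I can't see your actual parse_line, so the column indices, the variable name lines, and the parse_line_safe helper are all hypothetical and need to be adapted to your CSV layout.

def parse_line(line):
    fields = line.split(",")
    # '61.760999927297242' is a float string, so int() raises ValueError.
    # Parse it as float; use int(float(...)) only if you truly need an integer.
    key = fields[0]               # hypothetical grouping-key column
    value = float(fields[1])      # was presumably: int(fields[1])
    return (key, value)

# Alternatively, skip malformed rows instead of failing the whole job:
def parse_line_safe(line):
    try:
        fields = line.split(",")
        return (fields[0], float(fields[1]))
    except (ValueError, IndexError):
        return None

# 'lines' stands for your input RDD, e.g. lines = sc.textFile("your_file.csv")
parsed = lines.map(parse_line_safe).filter(lambda kv: kv is not None)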
I hope this helps!