How to write an Apache Arrow file

I'm trying to use the arrow crate to write some data that I'm reading from another source into an Arrow file, and I'm a bit lost (I'm new to Rust as well). I see that there is a FileWriter type, so I'm assuming I have to do something like this:

let mut writer = FileWriter::try_new(writer: W, schema)?;

FileWriter::write(writer, batch).finish(writer);

I've created my Schema and my RecordBatch, but I'm unsure what writer: W should be. Do I just call File::create("myfile")?; and pass the result into FileWriter::try_new as the first argument? I also can't find what the appropriate file extension is for an Apache Arrow file, so I don't know which extension to use.

This is as far as I've gotten:

use arrow::array::Int64Array;
use arrow::datatypes::{ Schema, Field, DataType };
use arrow::record_batch::RecordBatch;
use arrow::ipc::writer::FileWriter;
use std::fs::File;
use std::io::{self, BufRead};
use std::path::Path;
use std::sync::Arc;

/// Reads five fixed-width fields from every line of the CPS data file and
/// assembles them into a record batch of `Int64` columns.
///
/// Nothing happens if the input file cannot be opened, and lines that fail to
/// read are skipped. The value returned by `RecordBatch::try_new` is only bound
/// to a local; it is neither inspected nor written anywhere.
///
/// # Panics
///
/// Panics if a line is too short for the byte ranges sliced below, if a range
/// does not fall on a character boundary, or if a sliced field does not parse
/// as an `i64`.
fn main() {
    let lines = match read_lines("C:\\Users\\mthel\\.julia\\datadeps\\CPS 202012\\dec20pub.dat") {
        Ok(lines) => lines,
        Err(_) => return,
    };

    // One growable column per extracted field.
    let mut hrhhid = Vec::new();
    let mut hrmonth = Vec::new();
    let mut hryear4 = Vec::new();
    let mut gestfips = Vec::new();
    let mut prtage = Vec::new();

    for line in lines {
        let record = match line {
            Ok(text) => text,
            Err(_) => continue,
        };

        // Fields are located by fixed byte offsets within each record.
        hrhhid.push(record[0..15].parse::<i64>().expect("parse error (hrhhid)"));
        hrmonth.push(record[15..17].parse::<i64>().expect("parse error (hrmonth)"));
        hryear4.push(record[17..21].parse::<i64>().expect("parse error (hryear4)"));
        gestfips.push(record[92..94].parse::<i64>().expect("parse error (gestfips)"));
        // This field is trimmed before parsing; the others are parsed as sliced.
        prtage.push(record[121..123].trim().parse::<i64>().expect("parse error (prtage)"));
    }

    let fields = vec![
        Field::new("hrhhid", DataType::Int64, false),
        Field::new("hrmonth", DataType::Int64, false),
        Field::new("hryear4", DataType::Int64, false),
        Field::new("gestfips", DataType::Int64, false),
        Field::new("prtage", DataType::Int64, false),
    ];

    // Column order here must match the field order above.
    let batch = RecordBatch::try_new(
        Arc::new(Schema::new(fields)),
        vec![
            Arc::new(Int64Array::from(hrhhid)),
            Arc::new(Int64Array::from(hrmonth)),
            Arc::new(Int64Array::from(hryear4)),
            Arc::new(Int64Array::from(gestfips)),
            Arc::new(Int64Array::from(prtage)),
        ],
    );
}

/// Opens `filename` and returns an iterator over its lines.
///
/// The file is wrapped in a `BufReader`; each item yielded by the iterator is
/// an `io::Result<String>`.
///
/// # Errors
///
/// Returns the error from `File::open` if the file cannot be opened.
fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>>
where
    P: AsRef<Path>,
{
    File::open(filename).map(|handle| io::BufReader::new(handle).lines())
}

This appears to work:

use arrow::array::{ Int32Array, Int64Array };
use arrow::datatypes::{ Schema, Field, DataType };
use arrow::record_batch::RecordBatch;
use arrow::ipc:: writer::FileWriter;
use std::fs::File;
use std::io::{ self, BufRead };
use std::path::Path;
use std::sync::Arc;

/// Converts five fixed-width fields of the CPS data file into an Arrow IPC file.
///
/// Reads the input line by line, slices each field out of the record by byte
/// position, collects the values into typed columns, and writes them as a
/// single record batch to `cps_arrow.arrow`.
///
/// # Panics
///
/// Every failure is fatal and reported through the `expect` message:
/// the input file cannot be opened, a line cannot be read, a line is too short
/// for the sliced byte ranges (or a range does not fall on a character
/// boundary), a field does not parse as an integer, or the batch or output
/// file cannot be created, written, or finished.
fn main() {
    // A missing input is reported instead of silently producing no output.
    let lines = read_lines("C:\\Users\\mthel\\.julia\\datadeps\\CPS 202012\\dec20pub.dat")
        .expect("open input file error");

    let mut hrhhid = Vec::new();
    let mut hrmonth = Vec::new();
    let mut hryear4 = Vec::new();
    let mut gestfips = Vec::new();
    let mut prtage = Vec::new();

    for line in lines {
        // An unreadable line aborts the run: skipping it would silently drop a
        // record from the output, just as a malformed field already aborts below.
        let d = line.expect("read line error");

        // Fields are located by fixed byte offsets within each record.
        hrhhid.push(d[0..15].parse::<i64>().expect("parse error (hrhhid)"));
        hrmonth.push(d[15..17].parse::<i32>().expect("parse error (hrmonth)"));
        hryear4.push(d[17..21].parse::<i32>().expect("parse error (hryear4)"));
        gestfips.push(d[92..94].parse::<i32>().expect("parse error (gestfips)"));
        // This field is trimmed before parsing; the others are parsed as sliced.
        prtage.push(d[121..123].trim().parse::<i32>().expect("parse error (prtage)"));
    }

    // Build the schema once and share it between the batch and the writer so
    // the two can never disagree.
    let schema = Arc::new(Schema::new(vec![
        Field::new("hrhhid", DataType::Int64, false),
        Field::new("hrmonth", DataType::Int32, false),
        Field::new("hryear4", DataType::Int32, false),
        Field::new("gestfips", DataType::Int32, false),
        Field::new("prtage", DataType::Int32, false),
    ]));

    // Column order here must match the field order of the schema.
    let batch = RecordBatch::try_new(
        Arc::clone(&schema),
        vec![
            Arc::new(Int64Array::from(hrhhid)),
            Arc::new(Int32Array::from(hrmonth)),
            Arc::new(Int32Array::from(hryear4)),
            Arc::new(Int32Array::from(gestfips)),
            Arc::new(Int32Array::from(prtage)),
        ],
    )
    .expect("create batch error");

    let buffer = File::create("cps_arrow.arrow").expect("create file error");

    let mut writer = FileWriter::try_new(buffer, &schema).expect("create file writer error");

    writer.write(&batch).expect("write batch error");
    // `finish` must be called explicitly; its result is checked like the others.
    writer.finish().expect("finish write error");
}

/// Opens `filename` and returns an iterator over its lines.
///
/// The file is wrapped in a `BufReader`; each item yielded by the iterator is
/// an `io::Result<String>`.
///
/// # Errors
///
/// Returns the error from `File::open` if the file cannot be opened.
fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>>
where
    P: AsRef<Path>,
{
    let reader = io::BufReader::new(File::open(filename)?);
    Ok(reader.lines())
}

This topic was automatically closed 90 days after the last reply. We invite you to open a new topic if you have further questions or comments.