ぶろぐ

日記です

ltsv形式のログをs3に格納するようにtd-agentを設定した

ltsvでs3にとりあえず上げた。後でhiveでもsparkでもいくらでも集計できるように！
Redshiftに直接取り込むならtsvかjsonじゃないとダメみたいだけど…とりあえずltsvで。
scala使う機会がなさすぎるので簡単な集計でも無駄にspark使いたい・・・。
複数のログがある場合、fluent-plugin-forest を使うと設定をDRYにできてすばらしかった。

インストール

# Install td-agent 2 (the packaged Fluentd distribution) via the RedHat installer script.
curl -L https://toolbelt.treasuredata.com/sh/install-redhat-td-agent2.sh | sh

# Start the td-agent service.
sudo service td-agent start

# Install the output plugins referenced by td-agent.conf into td-agent's embedded Ruby.
# The other privileged step above uses sudo, so these run as root too: the gems are
# written under the package-owned /opt/td-agent/embedded tree.
# td-agent loads plugins at startup, so restart it after installing them and
# after editing td-agent.conf.
sudo /opt/td-agent/embedded/bin/fluent-gem install fluent-plugin-s3
sudo /opt/td-agent/embedded/bin/fluent-gem install fluent-plugin-forest

td-agent.conf

# Application log: tailed without parsing (format none) and emitted with tag s3.app.
<source>
  @type tail
  tag s3.app
  path /var/log/xxx/app/xxx-api.app.log
  pos_file /var/log/xxx/app/xxx-api.app.log.pos
  format none
  time_key time
</source>

# Access log: tailed and parsed as LTSV, event time taken from the "time" field,
# emitted with tag s3.access.
<source>
  @type tail
  tag s3.access
  path /var/log/xxx/access/xxx-api.access.log
  pos_file /var/log/xxx/access/xxx-api.access.log.pos
  format ltsv
  time_key time
</source>

# Action log: tailed and parsed as LTSV, event time taken from the "time" field,
# emitted with tag s3.action.
<source>
  @type tail
  tag s3.action
  path /var/log/xxx/action/xxx-api.action.log
  pos_file /var/log/xxx/action/xxx-api.action.log.pos
  format ltsv
  time_key time
</source>

# Ship every s3.* event to S3. fluent-plugin-forest instantiates one s3 output
# per tag from the shared <template>, so the three log types share one definition.
<match s3.*>
  # "@type" matches the spelling used by the <source> sections above.
  @type   forest
  subtype s3

  <template>
    s3_region          ap-northeast-1
    s3_bucket          production-xxx-log
    # ${tag_parts[1]} is the second tag component (e.g. "app" for s3.app);
    # the %Y/%m/%d/%H parts partition objects by hour.
    # NOTE(review): %{hostname} is left for the s3 plugin to expand; confirm it
    # is honored inside "path" (forest's own ${hostname} placeholder is an alternative).
    path               xxx-api/${tag_parts[1]}/%Y/%m/%d/%H/${tag_parts[1]}-%{hostname}_

    # File buffer, with a separate path per tag so the generated outputs do not collide.
    buffer_type        file
    buffer_path        /var/log/fluent/${tag}

    # Hourly time slices, computed in UTC.
    time_slice_format  %Y%m%d%H
    utc

    flush_at_shutdown  true

    # Default output format: LTSV records with the time key included.
    include_time_key   true
    format             ltsv
  </template>

  # The app log is tailed with "format none", so it is written with the
  # single_value format instead of LTSV.
  <case s3.app>
    include_time_key   true
    format             single_value
  </case>

</match>